{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 7588, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005271481286241434, "grad_norm": 195.0327897067415, "learning_rate": 9.988139167105956e-07, "logits/chosen": 0.906176745891571, "logits/rejected": 0.9178711175918579, "logps/chosen": -437.1499938964844, "logps/rejected": -403.25, "loss": 0.7015, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": 0.15147705376148224, "rewards/margins": -0.0005371093866415322, "rewards/rejected": 0.15204162895679474, "step": 10 }, { "epoch": 0.010542962572482868, "grad_norm": 125.94219298511759, "learning_rate": 9.974960463890353e-07, "logits/chosen": 0.892578125, "logits/rejected": 0.8863769769668579, "logps/chosen": -327.04998779296875, "logps/rejected": -292.45001220703125, "loss": 0.6776, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014907836448401213, "rewards/margins": 0.07001800835132599, "rewards/rejected": -0.07148132473230362, "step": 20 }, { "epoch": 0.0158144438587243, "grad_norm": 125.76730832797801, "learning_rate": 9.96178176067475e-07, "logits/chosen": 0.972363293170929, "logits/rejected": 1.0988280773162842, "logps/chosen": -313.42498779296875, "logps/rejected": -277.8500061035156, "loss": 0.6811, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.15717773139476776, "rewards/margins": 0.09623412787914276, "rewards/rejected": 0.06069183349609375, "step": 30 }, { "epoch": 0.021085925144965736, "grad_norm": 136.152207838109, "learning_rate": 9.948603057459145e-07, "logits/chosen": 1.127539038658142, "logits/rejected": 1.0646483898162842, "logps/chosen": -421.5, "logps/rejected": -347.54998779296875, "loss": 0.6683, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.706347644329071, "rewards/margins": 0.2527709901332855, "rewards/rejected": 0.45307618379592896, "step": 40 }, { "epoch": 0.02635740643120717, "grad_norm": 150.44979080692877, "learning_rate": 9.935424354243542e-07, "logits/chosen": 0.98095703125, "logits/rejected": 0.9649413824081421, "logps/chosen": -351.20001220703125, "logps/rejected": -320.6499938964844, "loss": 0.6495, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.38414305448532104, "rewards/margins": 0.259765625, "rewards/rejected": 0.12401123344898224, "step": 50 }, { "epoch": 0.0316288877174486, "grad_norm": 175.23509223396127, "learning_rate": 9.922245651027939e-07, "logits/chosen": 0.92724609375, "logits/rejected": 0.877490222454071, "logps/chosen": -429.3999938964844, "logps/rejected": -366.79998779296875, "loss": 0.5866, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.18220825493335724, "rewards/margins": 0.435220330953598, "rewards/rejected": -0.25286865234375, "step": 60 }, { "epoch": 0.03690036900369004, "grad_norm": 146.7188839059792, "learning_rate": 9.909066947812334e-07, "logits/chosen": 0.686279296875, "logits/rejected": 0.7569335699081421, "logps/chosen": -387.20001220703125, "logps/rejected": -361.79998779296875, "loss": 0.7296, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.47172850370407104, "rewards/margins": 0.13023681938648224, "rewards/rejected": -0.601367175579071, "step": 70 }, { "epoch": 0.04217185028993147, "grad_norm": 113.38318135140956, "learning_rate": 9.895888244596733e-07, "logits/chosen": 0.8501952886581421, "logits/rejected": 0.8690429925918579, "logps/chosen": -345.8999938964844, "logps/rejected": -273.0, "loss": 0.6035, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08529357612133026, "rewards/margins": 0.2972656190395355, "rewards/rejected": -0.3826446533203125, "step": 80 }, { "epoch": 0.047443331576172906, "grad_norm": 124.1805437083236, "learning_rate": 9.882709541381128e-07, "logits/chosen": 0.818164050579071, "logits/rejected": 0.813671886920929, "logps/chosen": -342.70001220703125, "logps/rejected": -338.95001220703125, "loss": 0.6591, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.02936859056353569, "rewards/margins": 0.2746948301792145, "rewards/rejected": -0.24531249701976776, "step": 90 }, { "epoch": 0.05271481286241434, "grad_norm": 121.7136048585754, "learning_rate": 9.869530838165525e-07, "logits/chosen": 0.8543945550918579, "logits/rejected": 0.8369385004043579, "logps/chosen": -349.6499938964844, "logps/rejected": -316.45001220703125, "loss": 0.6063, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07845459133386612, "rewards/margins": 0.3776184022426605, "rewards/rejected": -0.45637816190719604, "step": 100 }, { "epoch": 0.05798629414865577, "grad_norm": 126.57903837054607, "learning_rate": 9.85635213494992e-07, "logits/chosen": 0.72607421875, "logits/rejected": 0.783398449420929, "logps/chosen": -411.95001220703125, "logps/rejected": -364.3500061035156, "loss": 0.6164, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11612548679113388, "rewards/margins": 0.37086182832717896, "rewards/rejected": -0.4865966737270355, "step": 110 }, { "epoch": 0.0632577754348972, "grad_norm": 107.12627636980832, "learning_rate": 9.843173431734316e-07, "logits/chosen": 0.8062988519668579, "logits/rejected": 0.7843017578125, "logps/chosen": -329.3500061035156, "logps/rejected": -315.6499938964844, "loss": 0.6434, "rewards/accuracies": 0.65625, "rewards/chosen": -0.013018799014389515, "rewards/margins": 0.32861328125, "rewards/rejected": -0.3416503965854645, "step": 120 }, { "epoch": 0.06852925672113865, "grad_norm": 197.30033553565067, "learning_rate": 9.829994728518713e-07, "logits/chosen": 0.8397461175918579, "logits/rejected": 0.7993408441543579, "logps/chosen": -367.04998779296875, "logps/rejected": -340.6000061035156, "loss": 0.6955, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.10221557319164276, "rewards/margins": 0.23583984375, "rewards/rejected": -0.1337745636701584, "step": 130 }, { "epoch": 0.07380073800738007, "grad_norm": 129.2179247630524, "learning_rate": 9.81681602530311e-07, "logits/chosen": 0.841015636920929, "logits/rejected": 0.839550793170929, "logps/chosen": -354.3500061035156, "logps/rejected": -328.29998779296875, "loss": 0.6146, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.3588500916957855, "rewards/margins": 0.4165283143520355, "rewards/rejected": -0.05777587741613388, "step": 140 }, { "epoch": 0.0790722192936215, "grad_norm": 118.89511819979712, "learning_rate": 9.803637322087505e-07, "logits/chosen": 0.9994140863418579, "logits/rejected": 1.002832055091858, "logps/chosen": -369.6000061035156, "logps/rejected": -335.75, "loss": 0.6239, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.30265504121780396, "rewards/margins": 0.38221436738967896, "rewards/rejected": -0.07959900051355362, "step": 150 }, { "epoch": 0.08434370057986294, "grad_norm": 112.89677323564155, "learning_rate": 9.790458618871902e-07, "logits/chosen": 0.759228527545929, "logits/rejected": 0.7759765386581421, "logps/chosen": -389.45001220703125, "logps/rejected": -369.25, "loss": 0.5658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08851318061351776, "rewards/margins": 0.6343749761581421, "rewards/rejected": -0.723559558391571, "step": 160 }, { "epoch": 0.08961518186610437, "grad_norm": 142.51130950601535, "learning_rate": 9.7772799156563e-07, "logits/chosen": 0.592480480670929, "logits/rejected": 0.660937488079071, "logps/chosen": -415.45001220703125, "logps/rejected": -399.0, "loss": 0.7043, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.32015687227249146, "rewards/margins": 0.2957763671875, "rewards/rejected": -0.616162121295929, "step": 170 }, { "epoch": 0.09488666315234581, "grad_norm": 103.40424754433687, "learning_rate": 9.764101212440694e-07, "logits/chosen": 0.752490222454071, "logits/rejected": 0.797869861125946, "logps/chosen": -367.1499938964844, "logps/rejected": -367.70001220703125, "loss": 0.6042, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10686035454273224, "rewards/margins": 0.4187377989292145, "rewards/rejected": -0.3116210997104645, "step": 180 }, { "epoch": 0.10015814443858724, "grad_norm": 116.0299610220215, "learning_rate": 9.750922509225091e-07, "logits/chosen": 0.801953136920929, "logits/rejected": 0.92431640625, "logps/chosen": -353.70001220703125, "logps/rejected": -326.95001220703125, "loss": 0.6213, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.46259766817092896, "rewards/margins": 0.4621826112270355, "rewards/rejected": 0.00017089843458961695, "step": 190 }, { "epoch": 0.10542962572482868, "grad_norm": 160.21937325342105, "learning_rate": 9.737743806009488e-07, "logits/chosen": 0.84619140625, "logits/rejected": 0.705761730670929, "logps/chosen": -384.54998779296875, "logps/rejected": -336.1499938964844, "loss": 0.5789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.43000489473342896, "rewards/margins": 0.551770031452179, "rewards/rejected": -0.12111816555261612, "step": 200 }, { "epoch": 0.11070110701107011, "grad_norm": 207.48003877469768, "learning_rate": 9.724565102793885e-07, "logits/chosen": 0.8213866949081421, "logits/rejected": 0.6983886957168579, "logps/chosen": -374.70001220703125, "logps/rejected": -347.70001220703125, "loss": 0.6187, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3440795838832855, "rewards/margins": 0.6158447265625, "rewards/rejected": -0.27174681425094604, "step": 210 }, { "epoch": 0.11597258829731154, "grad_norm": 121.41017950781712, "learning_rate": 9.71138639957828e-07, "logits/chosen": 0.86669921875, "logits/rejected": 0.7954956293106079, "logps/chosen": -371.8999938964844, "logps/rejected": -341.1000061035156, "loss": 0.6572, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.44932860136032104, "rewards/margins": 0.3929687440395355, "rewards/rejected": 0.05634155124425888, "step": 220 }, { "epoch": 0.12124406958355298, "grad_norm": 95.51288417901466, "learning_rate": 9.698207696362677e-07, "logits/chosen": 0.8985351324081421, "logits/rejected": 0.9034179449081421, "logps/chosen": -378.20001220703125, "logps/rejected": -314.70001220703125, "loss": 0.5516, "rewards/accuracies": 0.625, "rewards/chosen": 0.681201159954071, "rewards/margins": 0.588757336139679, "rewards/rejected": 0.0931396484375, "step": 230 }, { "epoch": 0.1265155508697944, "grad_norm": 132.33833879430284, "learning_rate": 9.685028993147074e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.783496081829071, "logps/chosen": -397.54998779296875, "logps/rejected": -390.95001220703125, "loss": 0.6139, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.56689453125, "rewards/margins": 0.5779358148574829, "rewards/rejected": -0.010714721865952015, "step": 240 }, { "epoch": 0.13178703215603585, "grad_norm": 134.1788473685257, "learning_rate": 9.671850289931471e-07, "logits/chosen": 0.80517578125, "logits/rejected": 0.926562488079071, "logps/chosen": -342.04998779296875, "logps/rejected": -330.45001220703125, "loss": 0.6072, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14167480170726776, "rewards/margins": 0.504321277141571, "rewards/rejected": -0.3620956540107727, "step": 250 }, { "epoch": 0.1370585134422773, "grad_norm": 113.68981479265169, "learning_rate": 9.658671586715866e-07, "logits/chosen": 0.8184570074081421, "logits/rejected": 0.8117920160293579, "logps/chosen": -382.29998779296875, "logps/rejected": -371.1499938964844, "loss": 0.6708, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20853272080421448, "rewards/margins": 0.4186157286167145, "rewards/rejected": -0.627148449420929, "step": 260 }, { "epoch": 0.1423299947285187, "grad_norm": 93.04308258395385, "learning_rate": 9.645492883500263e-07, "logits/chosen": 0.7610107660293579, "logits/rejected": 0.755322277545929, "logps/chosen": -339.57501220703125, "logps/rejected": -333.1499938964844, "loss": 0.5597, "rewards/accuracies": 0.75, "rewards/chosen": 0.09686584770679474, "rewards/margins": 0.56927490234375, "rewards/rejected": -0.4723877012729645, "step": 270 }, { "epoch": 0.14760147601476015, "grad_norm": 134.37010902157917, "learning_rate": 9.63231418028466e-07, "logits/chosen": 0.6902099847793579, "logits/rejected": 0.6231445074081421, "logps/chosen": -379.25, "logps/rejected": -367.70001220703125, "loss": 0.5604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.25622254610061646, "rewards/margins": 0.6239013671875, "rewards/rejected": -0.3672729432582855, "step": 280 }, { "epoch": 0.1528729573010016, "grad_norm": 100.94389867858813, "learning_rate": 9.619135477069055e-07, "logits/chosen": 0.7251952886581421, "logits/rejected": 0.7589111328125, "logps/chosen": -332.04998779296875, "logps/rejected": -343.75, "loss": 0.6076, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.17520752549171448, "rewards/margins": 0.5380859375, "rewards/rejected": -0.36317747831344604, "step": 290 }, { "epoch": 0.158144438587243, "grad_norm": 131.77616028096944, "learning_rate": 9.605956773853452e-07, "logits/chosen": 0.6480957269668579, "logits/rejected": 0.6964355707168579, "logps/chosen": -380.8999938964844, "logps/rejected": -352.79998779296875, "loss": 0.6218, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3324829041957855, "rewards/margins": 0.666015625, "rewards/rejected": -0.3333740234375, "step": 300 }, { "epoch": 0.16341591987348444, "grad_norm": 145.56830792360296, "learning_rate": 9.59277807063785e-07, "logits/chosen": 0.635113537311554, "logits/rejected": 0.686816394329071, "logps/chosen": -333.54998779296875, "logps/rejected": -344.8500061035156, "loss": 0.6699, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.13051147758960724, "rewards/margins": 0.4708007872104645, "rewards/rejected": -0.3400207459926605, "step": 310 }, { "epoch": 0.16868740115972589, "grad_norm": 124.975058560379, "learning_rate": 9.579599367422246e-07, "logits/chosen": 0.606689453125, "logits/rejected": 0.5343993902206421, "logps/chosen": -385.0, "logps/rejected": -337.70001220703125, "loss": 0.5938, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04933471605181694, "rewards/margins": 0.5774780511856079, "rewards/rejected": -0.626538097858429, "step": 320 }, { "epoch": 0.17395888244596733, "grad_norm": 81.0556062958144, "learning_rate": 9.56642066420664e-07, "logits/chosen": 0.8160644769668579, "logits/rejected": 0.61474609375, "logps/chosen": -376.79998779296875, "logps/rejected": -359.54998779296875, "loss": 0.6469, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.10790710151195526, "rewards/margins": 0.561553955078125, "rewards/rejected": -0.45366209745407104, "step": 330 }, { "epoch": 0.17923036373220874, "grad_norm": 71.09234786140391, "learning_rate": 9.553241960991038e-07, "logits/chosen": 0.92724609375, "logits/rejected": 0.950024425983429, "logps/chosen": -381.04998779296875, "logps/rejected": -322.8999938964844, "loss": 0.5922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5098465085029602, "rewards/margins": 0.513842761516571, "rewards/rejected": -0.004122924990952015, "step": 340 }, { "epoch": 0.18450184501845018, "grad_norm": 150.1532419739584, "learning_rate": 9.540063257775435e-07, "logits/chosen": 1.0613281726837158, "logits/rejected": 1.021948218345642, "logps/chosen": -325.79998779296875, "logps/rejected": -338.6499938964844, "loss": 0.6349, "rewards/accuracies": 0.65625, "rewards/chosen": 1.055078148841858, "rewards/margins": 0.47600096464157104, "rewards/rejected": 0.578137218952179, "step": 350 }, { "epoch": 0.18977332630469163, "grad_norm": 111.74623015402356, "learning_rate": 9.526884554559831e-07, "logits/chosen": 0.850537121295929, "logits/rejected": 0.9039062261581421, "logps/chosen": -328.3500061035156, "logps/rejected": -286.54998779296875, "loss": 0.577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.81787109375, "rewards/margins": 0.646777331829071, "rewards/rejected": 0.17006225883960724, "step": 360 }, { "epoch": 0.19504480759093304, "grad_norm": 115.90283220056664, "learning_rate": 9.513705851344227e-07, "logits/chosen": 0.984179675579071, "logits/rejected": 1.0642578601837158, "logps/chosen": -325.95001220703125, "logps/rejected": -322.0, "loss": 0.5769, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.7190917730331421, "rewards/margins": 0.53515625, "rewards/rejected": 0.18483276665210724, "step": 370 }, { "epoch": 0.20031628887717448, "grad_norm": 147.26681826730805, "learning_rate": 9.500527148128624e-07, "logits/chosen": 0.9056457281112671, "logits/rejected": 0.8919922113418579, "logps/chosen": -390.1000061035156, "logps/rejected": -344.3999938964844, "loss": 0.5834, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8731445074081421, "rewards/margins": 0.6744323968887329, "rewards/rejected": 0.19941100478172302, "step": 380 }, { "epoch": 0.20558777016341592, "grad_norm": 104.88618473885907, "learning_rate": 9.487348444913021e-07, "logits/chosen": 0.872753918170929, "logits/rejected": 0.7909179925918579, "logps/chosen": -359.0, "logps/rejected": -315.20001220703125, "loss": 0.6096, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6266815066337585, "rewards/margins": 0.642871081829071, "rewards/rejected": -0.0159912109375, "step": 390 }, { "epoch": 0.21085925144965736, "grad_norm": 110.68887486371982, "learning_rate": 9.474169741697417e-07, "logits/chosen": 1.082617163658142, "logits/rejected": 0.959765613079071, "logps/chosen": -358.70001220703125, "logps/rejected": -327.3999938964844, "loss": 0.6155, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.739837646484375, "rewards/margins": 0.613525390625, "rewards/rejected": 0.12644653022289276, "step": 400 }, { "epoch": 0.21613073273589878, "grad_norm": 122.53486204848755, "learning_rate": 9.460991038481813e-07, "logits/chosen": 0.840576171875, "logits/rejected": 0.7552245855331421, "logps/chosen": -364.57501220703125, "logps/rejected": -358.1499938964844, "loss": 0.6215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5157226324081421, "rewards/margins": 0.7669922113418579, "rewards/rejected": -0.24955444037914276, "step": 410 }, { "epoch": 0.22140221402214022, "grad_norm": 156.98991232396278, "learning_rate": 9.44781233526621e-07, "logits/chosen": 0.745678722858429, "logits/rejected": 0.7955566644668579, "logps/chosen": -350.8999938964844, "logps/rejected": -386.6499938964844, "loss": 0.6322, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10751952975988388, "rewards/margins": 0.639697253704071, "rewards/rejected": -0.53179931640625, "step": 420 }, { "epoch": 0.22667369530838166, "grad_norm": 85.41325234533689, "learning_rate": 9.434633632050606e-07, "logits/chosen": 0.7496093511581421, "logits/rejected": 0.7322753667831421, "logps/chosen": -332.6499938964844, "logps/rejected": -318.29998779296875, "loss": 0.515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22052916884422302, "rewards/margins": 0.8679565191268921, "rewards/rejected": -0.6474609375, "step": 430 }, { "epoch": 0.23194517659462308, "grad_norm": 118.17259465155605, "learning_rate": 9.421454928835002e-07, "logits/chosen": 0.7743164300918579, "logits/rejected": 0.938671886920929, "logps/chosen": -368.0, "logps/rejected": -351.8500061035156, "loss": 0.6099, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.563427746295929, "rewards/margins": 0.559313952922821, "rewards/rejected": 0.0028732300270348787, "step": 440 }, { "epoch": 0.23721665788086452, "grad_norm": 129.92309110225594, "learning_rate": 9.408276225619399e-07, "logits/chosen": 0.863476574420929, "logits/rejected": 0.9012695550918579, "logps/chosen": -355.0, "logps/rejected": -332.79998779296875, "loss": 0.5878, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.750170886516571, "rewards/margins": 0.6548095941543579, "rewards/rejected": 0.09470214694738388, "step": 450 }, { "epoch": 0.24248813916710596, "grad_norm": 146.36595590103929, "learning_rate": 9.395097522403796e-07, "logits/chosen": 0.83544921875, "logits/rejected": 0.9013671875, "logps/chosen": -367.70001220703125, "logps/rejected": -365.8500061035156, "loss": 0.5688, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.84326171875, "rewards/margins": 0.7145751714706421, "rewards/rejected": 0.12911376357078552, "step": 460 }, { "epoch": 0.2477596204533474, "grad_norm": 119.57275763840259, "learning_rate": 9.381918819188192e-07, "logits/chosen": 0.626538097858429, "logits/rejected": 0.6456054449081421, "logps/chosen": -382.0, "logps/rejected": -337.3500061035156, "loss": 0.5882, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.580810546875, "rewards/margins": 0.8094726800918579, "rewards/rejected": -0.22788086533546448, "step": 470 }, { "epoch": 0.2530311017395888, "grad_norm": 109.18271302507178, "learning_rate": 9.368740115972587e-07, "logits/chosen": 0.90087890625, "logits/rejected": 0.895214855670929, "logps/chosen": -381.79998779296875, "logps/rejected": -333.57501220703125, "loss": 0.5955, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.38765257596969604, "rewards/margins": 0.681195080280304, "rewards/rejected": -0.293182373046875, "step": 480 }, { "epoch": 0.25830258302583026, "grad_norm": 120.33921404967042, "learning_rate": 9.355561412756983e-07, "logits/chosen": 0.750292956829071, "logits/rejected": 0.8297363519668579, "logps/chosen": -330.45001220703125, "logps/rejected": -316.70001220703125, "loss": 0.604, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8183349370956421, "rewards/margins": 0.7877441644668579, "rewards/rejected": 0.02974853478372097, "step": 490 }, { "epoch": 0.2635740643120717, "grad_norm": 120.26616902193162, "learning_rate": 9.34238270954138e-07, "logits/chosen": 0.894726574420929, "logits/rejected": 0.893261730670929, "logps/chosen": -331.25, "logps/rejected": -302.79998779296875, "loss": 0.6142, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.638629138469696, "rewards/margins": 0.6368774175643921, "rewards/rejected": 0.0018127441871911287, "step": 500 }, { "epoch": 0.26884554559831314, "grad_norm": 131.2325191209508, "learning_rate": 9.329204006325777e-07, "logits/chosen": 0.9541991949081421, "logits/rejected": 0.8610595464706421, "logps/chosen": -363.29998779296875, "logps/rejected": -362.29998779296875, "loss": 0.6414, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8794921636581421, "rewards/margins": 0.6828368902206421, "rewards/rejected": 0.1963043212890625, "step": 510 }, { "epoch": 0.2741170268845546, "grad_norm": 141.15304470771073, "learning_rate": 9.316025303110173e-07, "logits/chosen": 0.7618163824081421, "logits/rejected": 0.726757824420929, "logps/chosen": -357.25, "logps/rejected": -332.79998779296875, "loss": 0.5791, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.7579101324081421, "rewards/margins": 0.6947876214981079, "rewards/rejected": 0.06324692070484161, "step": 520 }, { "epoch": 0.27938850817079597, "grad_norm": 170.53239520640906, "learning_rate": 9.30284659989457e-07, "logits/chosen": 0.4786315858364105, "logits/rejected": 0.532336413860321, "logps/chosen": -388.8999938964844, "logps/rejected": -342.70001220703125, "loss": 0.5381, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.9263671636581421, "rewards/rejected": -0.679974377155304, "step": 530 }, { "epoch": 0.2846599894570374, "grad_norm": 114.46795481219831, "learning_rate": 9.289667896678966e-07, "logits/chosen": 0.7227538824081421, "logits/rejected": 0.7452148199081421, "logps/chosen": -318.75, "logps/rejected": -319.1000061035156, "loss": 0.6033, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.35774534940719604, "rewards/margins": 0.5657714605331421, "rewards/rejected": -0.20771484076976776, "step": 540 }, { "epoch": 0.28993147074327885, "grad_norm": 82.37940129622947, "learning_rate": 9.276489193463362e-07, "logits/chosen": 0.798779308795929, "logits/rejected": 0.795214831829071, "logps/chosen": -377.95001220703125, "logps/rejected": -342.04998779296875, "loss": 0.6413, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6808837652206421, "rewards/margins": 0.6385132074356079, "rewards/rejected": 0.04270629957318306, "step": 550 }, { "epoch": 0.2952029520295203, "grad_norm": 105.27633841648901, "learning_rate": 9.263310490247759e-07, "logits/chosen": 0.7159179449081421, "logits/rejected": 0.628955066204071, "logps/chosen": -387.79998779296875, "logps/rejected": -362.04998779296875, "loss": 0.5283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.49605101346969604, "rewards/margins": 0.835400402545929, "rewards/rejected": -0.33894044160842896, "step": 560 }, { "epoch": 0.30047443331576174, "grad_norm": 110.47625390104693, "learning_rate": 9.250131787032156e-07, "logits/chosen": 0.577349841594696, "logits/rejected": 0.5787597894668579, "logps/chosen": -406.54998779296875, "logps/rejected": -353.3500061035156, "loss": 0.5897, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01650390587747097, "rewards/margins": 0.8148437738418579, "rewards/rejected": -0.7982177734375, "step": 570 }, { "epoch": 0.3057459146020032, "grad_norm": 99.50972555071145, "learning_rate": 9.236953083816552e-07, "logits/chosen": 0.840624988079071, "logits/rejected": 0.82275390625, "logps/chosen": -360.6000061035156, "logps/rejected": -338.8999938964844, "loss": 0.5859, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.506103515625, "rewards/margins": 0.7109314203262329, "rewards/rejected": -0.204833984375, "step": 580 }, { "epoch": 0.3110173958882446, "grad_norm": 177.59030074918093, "learning_rate": 9.223774380600948e-07, "logits/chosen": 0.9205077886581421, "logits/rejected": 0.946972668170929, "logps/chosen": -412.6499938964844, "logps/rejected": -376.8999938964844, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": 1.0366699695587158, "rewards/margins": 0.6396239995956421, "rewards/rejected": 0.39710694551467896, "step": 590 }, { "epoch": 0.316288877174486, "grad_norm": 145.6739292111265, "learning_rate": 9.210595677385344e-07, "logits/chosen": 1.0517578125, "logits/rejected": 0.94384765625, "logps/chosen": -425.5, "logps/rejected": -361.75, "loss": 0.6424, "rewards/accuracies": 0.65625, "rewards/chosen": 0.85986328125, "rewards/margins": 0.60986328125, "rewards/rejected": 0.25048828125, "step": 600 }, { "epoch": 0.32156035846072745, "grad_norm": 146.03630423733165, "learning_rate": 9.197416974169741e-07, "logits/chosen": 0.817138671875, "logits/rejected": 0.913983166217804, "logps/chosen": -369.95001220703125, "logps/rejected": -369.54998779296875, "loss": 0.5948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.874560534954071, "rewards/margins": 0.683056652545929, "rewards/rejected": 0.19114990532398224, "step": 610 }, { "epoch": 0.3268318397469689, "grad_norm": 80.68531391966674, "learning_rate": 9.184238270954138e-07, "logits/chosen": 0.8533691167831421, "logits/rejected": 0.8180907964706421, "logps/chosen": -388.3500061035156, "logps/rejected": -357.3999938964844, "loss": 0.6489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5852295160293579, "rewards/margins": 0.802600085735321, "rewards/rejected": -0.21875610947608948, "step": 620 }, { "epoch": 0.33210332103321033, "grad_norm": 133.36903984406126, "learning_rate": 9.171059567738534e-07, "logits/chosen": 0.8172973394393921, "logits/rejected": 0.7413085699081421, "logps/chosen": -382.70001220703125, "logps/rejected": -337.29998779296875, "loss": 0.5377, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.6995849609375, "rewards/margins": 0.802978515625, "rewards/rejected": -0.10360107570886612, "step": 630 }, { "epoch": 0.33737480231945177, "grad_norm": 111.04261268854434, "learning_rate": 9.157880864522931e-07, "logits/chosen": 0.793261706829071, "logits/rejected": 0.841601550579071, "logps/chosen": -393.04998779296875, "logps/rejected": -364.25, "loss": 0.5636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.603710949420929, "rewards/margins": 0.762219250202179, "rewards/rejected": -0.15843506157398224, "step": 640 }, { "epoch": 0.3426462836056932, "grad_norm": 69.69046338416415, "learning_rate": 9.144702161307327e-07, "logits/chosen": 0.78125, "logits/rejected": 0.826171875, "logps/chosen": -336.79998779296875, "logps/rejected": -327.0, "loss": 0.6259, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.4851318299770355, "rewards/margins": 0.7438720464706421, "rewards/rejected": -0.25897216796875, "step": 650 }, { "epoch": 0.34791776489193466, "grad_norm": 95.78439412088628, "learning_rate": 9.131523458091723e-07, "logits/chosen": 0.5985351800918579, "logits/rejected": 0.5453125238418579, "logps/chosen": -391.25, "logps/rejected": -371.1499938964844, "loss": 0.5389, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.672381579875946, "rewards/margins": 1.053808569908142, "rewards/rejected": -0.380340576171875, "step": 660 }, { "epoch": 0.35318924617817604, "grad_norm": 119.5444994091872, "learning_rate": 9.118344754876119e-07, "logits/chosen": 0.74267578125, "logits/rejected": 0.7398742437362671, "logps/chosen": -381.75, "logps/rejected": -347.0, "loss": 0.5787, "rewards/accuracies": 0.65625, "rewards/chosen": 0.23936767876148224, "rewards/margins": 0.805590808391571, "rewards/rejected": -0.5658203363418579, "step": 670 }, { "epoch": 0.3584607274644175, "grad_norm": 185.75507206331235, "learning_rate": 9.105166051660517e-07, "logits/chosen": 0.7226806879043579, "logits/rejected": 0.773486316204071, "logps/chosen": -353.25, "logps/rejected": -328.1499938964844, "loss": 0.641, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.463653564453125, "rewards/margins": 0.689697265625, "rewards/rejected": -0.22600097954273224, "step": 680 }, { "epoch": 0.3637322087506589, "grad_norm": 106.81844716609173, "learning_rate": 9.091987348444913e-07, "logits/chosen": 0.7948242425918579, "logits/rejected": 0.802703857421875, "logps/chosen": -355.8999938964844, "logps/rejected": -356.45001220703125, "loss": 0.5735, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7723633050918579, "rewards/margins": 0.92681884765625, "rewards/rejected": -0.15436401963233948, "step": 690 }, { "epoch": 0.36900369003690037, "grad_norm": 117.19364376956696, "learning_rate": 9.078808645229309e-07, "logits/chosen": 0.863818347454071, "logits/rejected": 0.6862548589706421, "logps/chosen": -343.25, "logps/rejected": -293.1499938964844, "loss": 0.5788, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5865722894668579, "rewards/margins": 0.859179675579071, "rewards/rejected": -0.2713684141635895, "step": 700 }, { "epoch": 0.3742751713231418, "grad_norm": 139.43169993030992, "learning_rate": 9.065629942013705e-07, "logits/chosen": 0.8780273199081421, "logits/rejected": 0.796630859375, "logps/chosen": -371.8500061035156, "logps/rejected": -317.8500061035156, "loss": 0.5325, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7138916254043579, "rewards/margins": 0.84521484375, "rewards/rejected": -0.13181152939796448, "step": 710 }, { "epoch": 0.37954665260938325, "grad_norm": 75.77374339975887, "learning_rate": 9.052451238798102e-07, "logits/chosen": 0.869921863079071, "logits/rejected": 0.891308605670929, "logps/chosen": -375.5, "logps/rejected": -338.45001220703125, "loss": 0.6027, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.6412353515625, "rewards/margins": 0.921582043170929, "rewards/rejected": -0.27996826171875, "step": 720 }, { "epoch": 0.3848181338956247, "grad_norm": 117.26755317727493, "learning_rate": 9.039272535582499e-07, "logits/chosen": 0.859082043170929, "logits/rejected": 0.768310546875, "logps/chosen": -353.25, "logps/rejected": -339.1000061035156, "loss": 0.6195, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5848388671875, "rewards/margins": 0.701953113079071, "rewards/rejected": -0.11754150688648224, "step": 730 }, { "epoch": 0.3900896151818661, "grad_norm": 134.81450233527704, "learning_rate": 9.026093832366895e-07, "logits/chosen": 0.9253906011581421, "logits/rejected": 0.930957019329071, "logps/chosen": -352.70001220703125, "logps/rejected": -342.6000061035156, "loss": 0.6054, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.903076171875, "rewards/margins": 0.7924560308456421, "rewards/rejected": 0.11036987602710724, "step": 740 }, { "epoch": 0.3953610964681075, "grad_norm": 98.8140963800966, "learning_rate": 9.012915129151291e-07, "logits/chosen": 0.827832043170929, "logits/rejected": 0.6888672113418579, "logps/chosen": -367.0, "logps/rejected": -313.95001220703125, "loss": 0.6099, "rewards/accuracies": 0.625, "rewards/chosen": 1.0154297351837158, "rewards/margins": 0.712451159954071, "rewards/rejected": 0.30207520723342896, "step": 750 }, { "epoch": 0.40063257775434896, "grad_norm": 121.32425244055045, "learning_rate": 8.999736425935688e-07, "logits/chosen": 0.98828125, "logits/rejected": 0.8583008050918579, "logps/chosen": -362.8500061035156, "logps/rejected": -312.1000061035156, "loss": 0.6429, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.692187488079071, "rewards/margins": 0.6749267578125, "rewards/rejected": 0.01796874962747097, "step": 760 }, { "epoch": 0.4059040590405904, "grad_norm": 74.78710599178865, "learning_rate": 8.986557722720084e-07, "logits/chosen": 0.8291015625, "logits/rejected": 0.8454223871231079, "logps/chosen": -363.95001220703125, "logps/rejected": -344.79998779296875, "loss": 0.5389, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.740917980670929, "rewards/margins": 1.1161620616912842, "rewards/rejected": -0.37443846464157104, "step": 770 }, { "epoch": 0.41117554032683185, "grad_norm": 129.96848398311934, "learning_rate": 8.97337901950448e-07, "logits/chosen": 0.795117199420929, "logits/rejected": 0.711047351360321, "logps/chosen": -348.6000061035156, "logps/rejected": -317.79998779296875, "loss": 0.6209, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.6340087652206421, "rewards/margins": 0.7481445074081421, "rewards/rejected": -0.11475219577550888, "step": 780 }, { "epoch": 0.4164470216130733, "grad_norm": 92.82581232515204, "learning_rate": 8.960200316288878e-07, "logits/chosen": 0.853808581829071, "logits/rejected": 0.79931640625, "logps/chosen": -389.70001220703125, "logps/rejected": -320.70001220703125, "loss": 0.5619, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.6413329839706421, "rewards/margins": 0.8390136957168579, "rewards/rejected": -0.19691161811351776, "step": 790 }, { "epoch": 0.42171850289931473, "grad_norm": 105.83170814860699, "learning_rate": 8.947021613073274e-07, "logits/chosen": 0.843994140625, "logits/rejected": 0.748730480670929, "logps/chosen": -307.45001220703125, "logps/rejected": -332.6000061035156, "loss": 0.5735, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.6494506597518921, "rewards/margins": 0.858447253704071, "rewards/rejected": -0.20944824814796448, "step": 800 }, { "epoch": 0.4269899841855561, "grad_norm": 105.94062541021403, "learning_rate": 8.93384290985767e-07, "logits/chosen": 0.768994152545929, "logits/rejected": 0.7818359136581421, "logps/chosen": -386.6499938964844, "logps/rejected": -371.79998779296875, "loss": 0.6093, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.41423338651657104, "rewards/margins": 0.885815441608429, "rewards/rejected": -0.47199708223342896, "step": 810 }, { "epoch": 0.43226146547179756, "grad_norm": 118.46157331081271, "learning_rate": 8.920664206642066e-07, "logits/chosen": 0.9300781488418579, "logits/rejected": 0.83349609375, "logps/chosen": -398.25, "logps/rejected": -381.29998779296875, "loss": 0.6703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8050781488418579, "rewards/margins": 0.655957043170929, "rewards/rejected": 0.14935913681983948, "step": 820 }, { "epoch": 0.437532946758039, "grad_norm": 110.32765549746932, "learning_rate": 8.907485503426463e-07, "logits/chosen": 0.74371337890625, "logits/rejected": 0.7235962152481079, "logps/chosen": -400.0, "logps/rejected": -378.79998779296875, "loss": 0.6393, "rewards/accuracies": 0.59375, "rewards/chosen": 0.794726550579071, "rewards/margins": 0.707275390625, "rewards/rejected": 0.08839111030101776, "step": 830 }, { "epoch": 0.44280442804428044, "grad_norm": 122.72064899608436, "learning_rate": 8.894306800210858e-07, "logits/chosen": 0.933837890625, "logits/rejected": 0.814160168170929, "logps/chosen": -361.20001220703125, "logps/rejected": -297.57501220703125, "loss": 0.5465, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8904052972793579, "rewards/margins": 0.9796386957168579, "rewards/rejected": -0.088623046875, "step": 840 }, { "epoch": 0.4480759093305219, "grad_norm": 88.27048717038379, "learning_rate": 8.881128096995255e-07, "logits/chosen": 0.739794909954071, "logits/rejected": 0.816699206829071, "logps/chosen": -368.5, "logps/rejected": -362.70001220703125, "loss": 0.5448, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.1583983898162842, "rewards/margins": 0.8823486566543579, "rewards/rejected": 0.2759643495082855, "step": 850 }, { "epoch": 0.4533473906167633, "grad_norm": 83.39255454823132, "learning_rate": 8.867949393779651e-07, "logits/chosen": 0.920605480670929, "logits/rejected": 0.8426513671875, "logps/chosen": -391.5, "logps/rejected": -344.75, "loss": 0.5761, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.1007812023162842, "rewards/margins": 0.883056640625, "rewards/rejected": 0.21682128310203552, "step": 860 }, { "epoch": 0.45861887190300477, "grad_norm": 191.7240653642726, "learning_rate": 8.854770690564048e-07, "logits/chosen": 0.7909179925918579, "logits/rejected": 0.6896728277206421, "logps/chosen": -366.20001220703125, "logps/rejected": -317.125, "loss": 0.5926, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8317626714706421, "rewards/margins": 0.7711731195449829, "rewards/rejected": 0.06090698391199112, "step": 870 }, { "epoch": 0.46389035318924615, "grad_norm": 110.85571053499041, "learning_rate": 8.841591987348444e-07, "logits/chosen": 0.772753894329071, "logits/rejected": 0.827587902545929, "logps/chosen": -350.95001220703125, "logps/rejected": -375.75, "loss": 0.5115, "rewards/accuracies": 0.71875, "rewards/chosen": 0.79345703125, "rewards/margins": 1.075292944908142, "rewards/rejected": -0.2819457948207855, "step": 880 }, { "epoch": 0.4691618344754876, "grad_norm": 149.71379312399438, "learning_rate": 8.82841328413284e-07, "logits/chosen": 0.818310558795929, "logits/rejected": 0.854052722454071, "logps/chosen": -359.79998779296875, "logps/rejected": -352.70001220703125, "loss": 0.616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5280548334121704, "rewards/margins": 0.8417724370956421, "rewards/rejected": -0.3128112852573395, "step": 890 }, { "epoch": 0.47443331576172904, "grad_norm": 113.0938306870557, "learning_rate": 8.815234580917237e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.7734375, "logps/chosen": -359.0249938964844, "logps/rejected": -303.67498779296875, "loss": 0.5568, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.7337646484375, "rewards/margins": 0.8726562261581421, "rewards/rejected": -0.13798217475414276, "step": 900 }, { "epoch": 0.4797047970479705, "grad_norm": 121.9535732216956, "learning_rate": 8.802055877701634e-07, "logits/chosen": 0.948925793170929, "logits/rejected": 0.895068347454071, "logps/chosen": -385.79998779296875, "logps/rejected": -318.04998779296875, "loss": 0.5137, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.063867211341858, "rewards/margins": 1.082421898841858, "rewards/rejected": -0.017791748046875, "step": 910 }, { "epoch": 0.4849762783342119, "grad_norm": 147.4698746997001, "learning_rate": 8.78887717448603e-07, "logits/chosen": 0.842480480670929, "logits/rejected": 0.952929675579071, "logps/chosen": -375.0, "logps/rejected": -332.6000061035156, "loss": 0.5095, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0359375476837158, "rewards/margins": 1.2773926258087158, "rewards/rejected": -0.24270018935203552, "step": 920 }, { "epoch": 0.49024775962045336, "grad_norm": 147.76915839526515, "learning_rate": 8.775698471270426e-07, "logits/chosen": 0.8736816644668579, "logits/rejected": 0.858642578125, "logps/chosen": -349.70001220703125, "logps/rejected": -323.75, "loss": 0.6868, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.9290527105331421, "rewards/margins": 0.7851318120956421, "rewards/rejected": 0.14472655951976776, "step": 930 }, { "epoch": 0.4955192409066948, "grad_norm": 106.62209261032953, "learning_rate": 8.762519768054823e-07, "logits/chosen": 0.9886718988418579, "logits/rejected": 0.8677734136581421, "logps/chosen": -437.54998779296875, "logps/rejected": -377.1499938964844, "loss": 0.5221, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.058691382408142, "rewards/margins": 1.098242163658142, "rewards/rejected": -0.03943786770105362, "step": 940 }, { "epoch": 0.5007907221929362, "grad_norm": 85.36054329839537, "learning_rate": 8.749341064839219e-07, "logits/chosen": 0.751660168170929, "logits/rejected": 0.754443347454071, "logps/chosen": -357.6000061035156, "logps/rejected": -371.95001220703125, "loss": 0.619, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.779980480670929, "rewards/margins": 0.9005066156387329, "rewards/rejected": -0.12051086127758026, "step": 950 }, { "epoch": 0.5060622034791776, "grad_norm": 124.0095139137979, "learning_rate": 8.736162361623616e-07, "logits/chosen": 0.7721191644668579, "logits/rejected": 0.741259753704071, "logps/chosen": -334.1499938964844, "logps/rejected": -330.1000061035156, "loss": 0.6456, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3148437440395355, "rewards/margins": 0.7193847894668579, "rewards/rejected": -0.4045257568359375, "step": 960 }, { "epoch": 0.5113336847654191, "grad_norm": 114.08657860786839, "learning_rate": 8.722983658408012e-07, "logits/chosen": 0.6832031011581421, "logits/rejected": 0.622851550579071, "logps/chosen": -393.54998779296875, "logps/rejected": -379.1499938964844, "loss": 0.599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5779052972793579, "rewards/margins": 0.8212890625, "rewards/rejected": -0.24350586533546448, "step": 970 }, { "epoch": 0.5166051660516605, "grad_norm": 107.1147962588611, "learning_rate": 8.709804955192409e-07, "logits/chosen": 0.716992199420929, "logits/rejected": 0.629931628704071, "logps/chosen": -323.45001220703125, "logps/rejected": -309.6499938964844, "loss": 0.5249, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8935546875, "rewards/margins": 0.9620116949081421, "rewards/rejected": -0.06785888969898224, "step": 980 }, { "epoch": 0.521876647337902, "grad_norm": 101.58189506623181, "learning_rate": 8.696626251976805e-07, "logits/chosen": 0.64404296875, "logits/rejected": 0.7242676019668579, "logps/chosen": -356.8999938964844, "logps/rejected": -358.1499938964844, "loss": 0.6026, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.003808617591858, "rewards/margins": 0.868969738483429, "rewards/rejected": 0.13547363877296448, "step": 990 }, { "epoch": 0.5271481286241434, "grad_norm": 104.62900076642923, "learning_rate": 8.683447548761201e-07, "logits/chosen": 0.7349609136581421, "logits/rejected": 0.7767089605331421, "logps/chosen": -310.70001220703125, "logps/rejected": -308.45001220703125, "loss": 0.6168, "rewards/accuracies": 0.65625, "rewards/chosen": 0.851367175579071, "rewards/margins": 0.687939465045929, "rewards/rejected": 0.16334839165210724, "step": 1000 }, { "epoch": 0.5324196099103848, "grad_norm": 137.13749922936813, "learning_rate": 8.670268845545597e-07, "logits/chosen": 0.871826171875, "logits/rejected": 0.8369140625, "logps/chosen": -334.25, "logps/rejected": -303.875, "loss": 0.4974, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9669433832168579, "rewards/margins": 1.1200072765350342, "rewards/rejected": -0.15253905951976776, "step": 1010 }, { "epoch": 0.5376910911966263, "grad_norm": 110.92515149631303, "learning_rate": 8.657090142329995e-07, "logits/chosen": 0.80517578125, "logits/rejected": 0.698925793170929, "logps/chosen": -379.3500061035156, "logps/rejected": -355.25, "loss": 0.5682, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.160546898841858, "rewards/margins": 0.943896472454071, "rewards/rejected": 0.21710205078125, "step": 1020 }, { "epoch": 0.5429625724828677, "grad_norm": 93.86139209269648, "learning_rate": 8.643911439114391e-07, "logits/chosen": 0.9281250238418579, "logits/rejected": 0.9583984613418579, "logps/chosen": -385.8999938964844, "logps/rejected": -350.3999938964844, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1974608898162842, "rewards/margins": 1.043701171875, "rewards/rejected": 0.15456542372703552, "step": 1030 }, { "epoch": 0.5482340537691092, "grad_norm": 166.76735730507687, "learning_rate": 8.630732735898787e-07, "logits/chosen": 0.841601550579071, "logits/rejected": 0.7579101324081421, "logps/chosen": -388.0, "logps/rejected": -346.45001220703125, "loss": 0.6157, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0901367664337158, "rewards/margins": 0.9604736566543579, "rewards/rejected": 0.12910155951976776, "step": 1040 }, { "epoch": 0.5535055350553506, "grad_norm": 136.9488157507124, "learning_rate": 8.617554032683183e-07, "logits/chosen": 0.7435058355331421, "logits/rejected": 0.68328857421875, "logps/chosen": -365.1000061035156, "logps/rejected": -330.70001220703125, "loss": 0.5957, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.609423816204071, "rewards/margins": 0.9117187261581421, "rewards/rejected": -0.3024658262729645, "step": 1050 }, { "epoch": 0.5587770163415919, "grad_norm": 86.4897898261502, "learning_rate": 8.60437532946758e-07, "logits/chosen": 0.7562011480331421, "logits/rejected": 0.709277331829071, "logps/chosen": -369.1499938964844, "logps/rejected": -350.25, "loss": 0.5186, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8750976324081421, "rewards/margins": 0.9781249761581421, "rewards/rejected": -0.10306396335363388, "step": 1060 }, { "epoch": 0.5640484976278334, "grad_norm": 120.04422155267943, "learning_rate": 8.591196626251977e-07, "logits/chosen": 0.7899414300918579, "logits/rejected": 0.725756824016571, "logps/chosen": -336.79998779296875, "logps/rejected": -310.7749938964844, "loss": 0.5918, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7805694341659546, "rewards/margins": 0.8656982183456421, "rewards/rejected": -0.08481445163488388, "step": 1070 }, { "epoch": 0.5693199789140748, "grad_norm": 125.65257045694351, "learning_rate": 8.578017923036373e-07, "logits/chosen": 0.8425048589706421, "logits/rejected": 0.778594970703125, "logps/chosen": -335.1000061035156, "logps/rejected": -305.54998779296875, "loss": 0.5964, "rewards/accuracies": 0.65625, "rewards/chosen": 0.839813232421875, "rewards/margins": 0.878173828125, "rewards/rejected": -0.0389404296875, "step": 1080 }, { "epoch": 0.5745914602003163, "grad_norm": 65.6145640296093, "learning_rate": 8.56483921982077e-07, "logits/chosen": 0.9930664300918579, "logits/rejected": 0.878613293170929, "logps/chosen": -312.79998779296875, "logps/rejected": -308.8500061035156, "loss": 0.5242, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1123535633087158, "rewards/margins": 1.0439453125, "rewards/rejected": 0.06782226264476776, "step": 1090 }, { "epoch": 0.5798629414865577, "grad_norm": 139.05411692042128, "learning_rate": 8.551660516605166e-07, "logits/chosen": 0.9052734375, "logits/rejected": 0.826464831829071, "logps/chosen": -386.20001220703125, "logps/rejected": -361.95001220703125, "loss": 0.5704, "rewards/accuracies": 0.6875, "rewards/chosen": 0.84716796875, "rewards/margins": 1.004492163658142, "rewards/rejected": -0.15775756537914276, "step": 1100 }, { "epoch": 0.5851344227727991, "grad_norm": 112.3666580343241, "learning_rate": 8.538481813389562e-07, "logits/chosen": 0.778393566608429, "logits/rejected": 0.7120116949081421, "logps/chosen": -341.45001220703125, "logps/rejected": -330.75, "loss": 0.5755, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.5264892578125, "rewards/margins": 0.936767578125, "rewards/rejected": -0.40966796875, "step": 1110 }, { "epoch": 0.5904059040590406, "grad_norm": 128.10317047075105, "learning_rate": 8.525303110173958e-07, "logits/chosen": 0.932421863079071, "logits/rejected": 0.797656238079071, "logps/chosen": -379.04998779296875, "logps/rejected": -356.8500061035156, "loss": 0.5265, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.660595715045929, "rewards/margins": 1.1353027820587158, "rewards/rejected": -0.47368162870407104, "step": 1120 }, { "epoch": 0.595677385345282, "grad_norm": 137.63166371722642, "learning_rate": 8.512124406958356e-07, "logits/chosen": 0.8207031488418579, "logits/rejected": 0.8076171875, "logps/chosen": -347.79998779296875, "logps/rejected": -318.0, "loss": 0.609, "rewards/accuracies": 0.71875, "rewards/chosen": 0.82720947265625, "rewards/margins": 1.057470679283142, "rewards/rejected": -0.228973388671875, "step": 1130 }, { "epoch": 0.6009488666315235, "grad_norm": 192.50257618758772, "learning_rate": 8.498945703742752e-07, "logits/chosen": 0.825146496295929, "logits/rejected": 0.757843017578125, "logps/chosen": -375.04998779296875, "logps/rejected": -385.6000061035156, "loss": 0.6334, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.8773437738418579, "rewards/margins": 0.786865234375, "rewards/rejected": 0.09139404445886612, "step": 1140 }, { "epoch": 0.6062203479177649, "grad_norm": 91.48106531526288, "learning_rate": 8.485767000527148e-07, "logits/chosen": 0.731689453125, "logits/rejected": 0.8008788824081421, "logps/chosen": -330.3500061035156, "logps/rejected": -336.8999938964844, "loss": 0.599, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.926928699016571, "rewards/margins": 0.944287121295929, "rewards/rejected": -0.01699218712747097, "step": 1150 }, { "epoch": 0.6114918292040064, "grad_norm": 180.7170945806914, "learning_rate": 8.472588297311544e-07, "logits/chosen": 0.717602550983429, "logits/rejected": 0.693554699420929, "logps/chosen": -382.5, "logps/rejected": -370.1499938964844, "loss": 0.6574, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.76416015625, "rewards/margins": 0.6677795648574829, "rewards/rejected": 0.09633789211511612, "step": 1160 }, { "epoch": 0.6167633104902478, "grad_norm": 139.87481879698288, "learning_rate": 8.459409594095941e-07, "logits/chosen": 0.77783203125, "logits/rejected": 0.743725597858429, "logps/chosen": -312.45001220703125, "logps/rejected": -298.70001220703125, "loss": 0.6648, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.687695324420929, "rewards/margins": 0.6653076410293579, "rewards/rejected": 0.02211914025247097, "step": 1170 }, { "epoch": 0.6220347917764892, "grad_norm": 90.35672154700887, "learning_rate": 8.446230890880337e-07, "logits/chosen": 0.73779296875, "logits/rejected": 0.7499023675918579, "logps/chosen": -382.0, "logps/rejected": -391.95001220703125, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.900585949420929, "rewards/margins": 1.1053955554962158, "rewards/rejected": -0.20513916015625, "step": 1180 }, { "epoch": 0.6273062730627307, "grad_norm": 123.70285677613938, "learning_rate": 8.433052187664734e-07, "logits/chosen": 0.651684582233429, "logits/rejected": 0.782910168170929, "logps/chosen": -341.3500061035156, "logps/rejected": -334.95001220703125, "loss": 0.5463, "rewards/accuracies": 0.71875, "rewards/chosen": 0.923632800579071, "rewards/margins": 1.1437499523162842, "rewards/rejected": -0.21921996772289276, "step": 1190 }, { "epoch": 0.632577754348972, "grad_norm": 96.13377461642783, "learning_rate": 8.419873484449131e-07, "logits/chosen": 0.797167956829071, "logits/rejected": 0.7704101800918579, "logps/chosen": -366.3999938964844, "logps/rejected": -337.8500061035156, "loss": 0.6041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.854052722454071, "rewards/margins": 0.988354504108429, "rewards/rejected": -0.13374023139476776, "step": 1200 }, { "epoch": 0.6378492356352135, "grad_norm": 169.6507832892545, "learning_rate": 8.406694781233526e-07, "logits/chosen": 0.7926269769668579, "logits/rejected": 0.6640259027481079, "logps/chosen": -368.1000061035156, "logps/rejected": -339.29998779296875, "loss": 0.6705, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.963549792766571, "rewards/margins": 0.8261474370956421, "rewards/rejected": 0.13768310844898224, "step": 1210 }, { "epoch": 0.6431207169214549, "grad_norm": 86.18822411473002, "learning_rate": 8.393516078017922e-07, "logits/chosen": 0.763671875, "logits/rejected": 0.79931640625, "logps/chosen": -382.6499938964844, "logps/rejected": -349.54998779296875, "loss": 0.7456, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.0504639148712158, "rewards/margins": 0.68206787109375, "rewards/rejected": 0.367401123046875, "step": 1220 }, { "epoch": 0.6483921982076963, "grad_norm": 104.44532993184471, "learning_rate": 8.380337374802318e-07, "logits/chosen": 0.7570556402206421, "logits/rejected": 0.706188976764679, "logps/chosen": -364.6000061035156, "logps/rejected": -360.70001220703125, "loss": 0.586, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.9112304449081421, "rewards/margins": 0.786938488483429, "rewards/rejected": 0.12363891303539276, "step": 1230 }, { "epoch": 0.6536636794939378, "grad_norm": 78.92512088014344, "learning_rate": 8.367158671586716e-07, "logits/chosen": 0.46014404296875, "logits/rejected": 0.48101806640625, "logps/chosen": -415.6000061035156, "logps/rejected": -366.1000061035156, "loss": 0.558, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.48173826932907104, "rewards/margins": 0.9697265625, "rewards/rejected": -0.48759764432907104, "step": 1240 }, { "epoch": 0.6589351607801792, "grad_norm": 123.64849048066559, "learning_rate": 8.353979968371112e-07, "logits/chosen": 0.699462890625, "logits/rejected": 0.701403796672821, "logps/chosen": -337.29998779296875, "logps/rejected": -346.54998779296875, "loss": 0.6682, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5520995855331421, "rewards/margins": 0.81219482421875, "rewards/rejected": -0.2600463926792145, "step": 1250 }, { "epoch": 0.6642066420664207, "grad_norm": 170.55278056735227, "learning_rate": 8.340801265155508e-07, "logits/chosen": 0.53033447265625, "logits/rejected": 0.49702149629592896, "logps/chosen": -365.79998779296875, "logps/rejected": -338.3999938964844, "loss": 0.6357, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.8619629144668579, "rewards/margins": 0.7878662347793579, "rewards/rejected": 0.07451782375574112, "step": 1260 }, { "epoch": 0.6694781233526621, "grad_norm": 109.80235768499024, "learning_rate": 8.327622561939904e-07, "logits/chosen": 0.941943347454071, "logits/rejected": 0.9500976800918579, "logps/chosen": -322.29998779296875, "logps/rejected": -295.04998779296875, "loss": 0.6343, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.036950707435608, "rewards/margins": 0.7531493902206421, "rewards/rejected": 0.2836669981479645, "step": 1270 }, { "epoch": 0.6747496046389035, "grad_norm": 108.74784079246557, "learning_rate": 8.314443858724301e-07, "logits/chosen": 0.888476550579071, "logits/rejected": 0.785449206829071, "logps/chosen": -345.20001220703125, "logps/rejected": -325.0, "loss": 0.6343, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.114843726158142, "rewards/margins": 0.713061511516571, "rewards/rejected": 0.40180665254592896, "step": 1280 }, { "epoch": 0.680021085925145, "grad_norm": 133.12273729071507, "learning_rate": 8.301265155508697e-07, "logits/chosen": 0.9930664300918579, "logits/rejected": 1.0046875476837158, "logps/chosen": -344.75, "logps/rejected": -328.8999938964844, "loss": 0.638, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.2427856922149658, "rewards/margins": 0.795458972454071, "rewards/rejected": 0.4483886659145355, "step": 1290 }, { "epoch": 0.6852925672113864, "grad_norm": 118.0196521852784, "learning_rate": 8.288086452293094e-07, "logits/chosen": 0.8495117425918579, "logits/rejected": 0.7803710699081421, "logps/chosen": -337.1000061035156, "logps/rejected": -328.95001220703125, "loss": 0.5901, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.5236327648162842, "rewards/margins": 0.8613647222518921, "rewards/rejected": 0.66033935546875, "step": 1300 }, { "epoch": 0.6905640484976279, "grad_norm": 127.08903159931943, "learning_rate": 8.27490774907749e-07, "logits/chosen": 0.9107421636581421, "logits/rejected": 0.871386706829071, "logps/chosen": -428.95001220703125, "logps/rejected": -358.29998779296875, "loss": 0.5428, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.4412109851837158, "rewards/margins": 1.080908179283142, "rewards/rejected": 0.35980528593063354, "step": 1310 }, { "epoch": 0.6958355297838693, "grad_norm": 143.43451929576835, "learning_rate": 8.261729045861887e-07, "logits/chosen": 1.0053222179412842, "logits/rejected": 0.873730480670929, "logps/chosen": -376.54998779296875, "logps/rejected": -341.8999938964844, "loss": 0.5494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.627539038658142, "rewards/margins": 0.9886230230331421, "rewards/rejected": 0.638867199420929, "step": 1320 }, { "epoch": 0.7011070110701108, "grad_norm": 116.76275025692195, "learning_rate": 8.248550342646283e-07, "logits/chosen": 1.038476586341858, "logits/rejected": 0.904833972454071, "logps/chosen": -383.1499938964844, "logps/rejected": -342.0, "loss": 0.4875, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.641015648841858, "rewards/margins": 1.2585937976837158, "rewards/rejected": 0.38179320096969604, "step": 1330 }, { "epoch": 0.7063784923563521, "grad_norm": 108.43759803885371, "learning_rate": 8.235371639430679e-07, "logits/chosen": 0.931640625, "logits/rejected": 0.850878894329071, "logps/chosen": -399.20001220703125, "logps/rejected": -354.1499938964844, "loss": 0.4845, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.416601538658142, "rewards/margins": 1.276525855064392, "rewards/rejected": 0.14007568359375, "step": 1340 }, { "epoch": 0.7116499736425935, "grad_norm": 187.52910081418727, "learning_rate": 8.222192936215076e-07, "logits/chosen": 0.864697277545929, "logits/rejected": 0.7901855707168579, "logps/chosen": -369.1499938964844, "logps/rejected": -344.20001220703125, "loss": 0.7039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.045996069908142, "rewards/margins": 0.672656238079071, "rewards/rejected": 0.37373048067092896, "step": 1350 }, { "epoch": 0.716921454928835, "grad_norm": 128.78290345325246, "learning_rate": 8.209014232999473e-07, "logits/chosen": 0.848583996295929, "logits/rejected": 0.757275402545929, "logps/chosen": -369.5, "logps/rejected": -325.04998779296875, "loss": 0.6771, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.231689453125, "rewards/margins": 0.6773010492324829, "rewards/rejected": 0.5566650629043579, "step": 1360 }, { "epoch": 0.7221929362150764, "grad_norm": 139.33177306357362, "learning_rate": 8.195835529783869e-07, "logits/chosen": 0.874755859375, "logits/rejected": 0.8018554449081421, "logps/chosen": -337.3500061035156, "logps/rejected": -350.3500061035156, "loss": 0.5619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.16650390625, "rewards/margins": 0.972412109375, "rewards/rejected": 0.19541016221046448, "step": 1370 }, { "epoch": 0.7274644175013179, "grad_norm": 98.1760756584025, "learning_rate": 8.182656826568265e-07, "logits/chosen": 0.811816394329071, "logits/rejected": 0.7837890386581421, "logps/chosen": -381.29998779296875, "logps/rejected": -328.20001220703125, "loss": 0.5789, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.148828148841858, "rewards/margins": 0.997998058795929, "rewards/rejected": 0.15051880478858948, "step": 1380 }, { "epoch": 0.7327358987875593, "grad_norm": 114.14355980659691, "learning_rate": 8.169478123352662e-07, "logits/chosen": 0.825146496295929, "logits/rejected": 0.7448180913925171, "logps/chosen": -340.5, "logps/rejected": -339.20001220703125, "loss": 0.5052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.4500000476837158, "rewards/margins": 1.0955078601837158, "rewards/rejected": 0.35406494140625, "step": 1390 }, { "epoch": 0.7380073800738007, "grad_norm": 111.66633826732001, "learning_rate": 8.156299420137058e-07, "logits/chosen": 0.7704101800918579, "logits/rejected": 0.684887707233429, "logps/chosen": -351.3500061035156, "logps/rejected": -314.3500061035156, "loss": 0.581, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0868651866912842, "rewards/margins": 0.7303711175918579, "rewards/rejected": 0.3573364317417145, "step": 1400 }, { "epoch": 0.7432788613600422, "grad_norm": 104.79763622542778, "learning_rate": 8.143120716921455e-07, "logits/chosen": 0.916210949420929, "logits/rejected": 0.808154284954071, "logps/chosen": -391.70001220703125, "logps/rejected": -335.3500061035156, "loss": 0.5737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.4597656726837158, "rewards/margins": 0.8645874261856079, "rewards/rejected": 0.594531238079071, "step": 1410 }, { "epoch": 0.7485503426462836, "grad_norm": 167.8548334172145, "learning_rate": 8.129942013705851e-07, "logits/chosen": 0.9144531488418579, "logits/rejected": 0.794726550579071, "logps/chosen": -374.54998779296875, "logps/rejected": -348.54998779296875, "loss": 0.6543, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.104638695716858, "rewards/margins": 0.99151611328125, "rewards/rejected": 0.11279296875, "step": 1420 }, { "epoch": 0.7538218239325251, "grad_norm": 160.73797411928683, "learning_rate": 8.116763310490248e-07, "logits/chosen": 0.91357421875, "logits/rejected": 0.925488293170929, "logps/chosen": -349.29998779296875, "logps/rejected": -369.0, "loss": 0.6409, "rewards/accuracies": 0.65625, "rewards/chosen": 1.099023461341858, "rewards/margins": 0.804003894329071, "rewards/rejected": 0.2935546934604645, "step": 1430 }, { "epoch": 0.7590933052187665, "grad_norm": 86.71130747052099, "learning_rate": 8.103584607274644e-07, "logits/chosen": 0.78955078125, "logits/rejected": 0.853808581829071, "logps/chosen": -357.20001220703125, "logps/rejected": -307.70001220703125, "loss": 0.6249, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.931347668170929, "rewards/margins": 0.702465832233429, "rewards/rejected": 0.22984619438648224, "step": 1440 }, { "epoch": 0.7643647865050079, "grad_norm": 82.7768042097966, "learning_rate": 8.09040590405904e-07, "logits/chosen": 0.896289050579071, "logits/rejected": 0.90625, "logps/chosen": -368.3500061035156, "logps/rejected": -341.8500061035156, "loss": 0.5786, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.153222680091858, "rewards/margins": 0.9783691167831421, "rewards/rejected": 0.17453613877296448, "step": 1450 }, { "epoch": 0.7696362677912494, "grad_norm": 112.97289021411883, "learning_rate": 8.077227200843436e-07, "logits/chosen": 0.802783191204071, "logits/rejected": 0.6663818359375, "logps/chosen": -379.1499938964844, "logps/rejected": -331.8500061035156, "loss": 0.6534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.7684326171875, "rewards/margins": 0.8141113519668579, "rewards/rejected": -0.04532470554113388, "step": 1460 }, { "epoch": 0.7749077490774908, "grad_norm": 126.23820881980976, "learning_rate": 8.064048497627834e-07, "logits/chosen": 0.8667968511581421, "logits/rejected": 0.943652331829071, "logps/chosen": -329.3999938964844, "logps/rejected": -328.42498779296875, "loss": 0.596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.970898449420929, "rewards/margins": 0.856249988079071, "rewards/rejected": 0.11494140326976776, "step": 1470 }, { "epoch": 0.7801792303637322, "grad_norm": 138.81234491311616, "learning_rate": 8.05086979441223e-07, "logits/chosen": 0.8167968988418579, "logits/rejected": 0.836132824420929, "logps/chosen": -334.6499938964844, "logps/rejected": -314.1499938964844, "loss": 0.6246, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.764892578125, "rewards/margins": 0.6267455816268921, "rewards/rejected": 0.13708190619945526, "step": 1480 }, { "epoch": 0.7854507116499736, "grad_norm": 85.51990331318396, "learning_rate": 8.037691091196626e-07, "logits/chosen": 0.7958984375, "logits/rejected": 0.8262695074081421, "logps/chosen": -351.1499938964844, "logps/rejected": -328.8999938964844, "loss": 0.4454, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.792742908000946, "rewards/margins": 1.2001953125, "rewards/rejected": -0.40800780057907104, "step": 1490 }, { "epoch": 0.790722192936215, "grad_norm": 99.30257290721815, "learning_rate": 8.024512387981023e-07, "logits/chosen": 0.6227051019668579, "logits/rejected": 0.6551758050918579, "logps/chosen": -382.45001220703125, "logps/rejected": -386.8500061035156, "loss": 0.6011, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2763915956020355, "rewards/margins": 1.0065429210662842, "rewards/rejected": -0.7293945550918579, "step": 1500 }, { "epoch": 0.7959936742224565, "grad_norm": 115.59334159723743, "learning_rate": 8.011333684765419e-07, "logits/chosen": 0.668872058391571, "logits/rejected": 0.705432116985321, "logps/chosen": -333.1000061035156, "logps/rejected": -323.3999938964844, "loss": 0.5832, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.480224609375, "rewards/margins": 1.0009276866912842, "rewards/rejected": -0.5205444097518921, "step": 1510 }, { "epoch": 0.8012651555086979, "grad_norm": 94.59853895622598, "learning_rate": 7.998154981549815e-07, "logits/chosen": 0.7259277105331421, "logits/rejected": 0.7291015386581421, "logps/chosen": -393.0, "logps/rejected": -344.3500061035156, "loss": 0.5443, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.688830554485321, "rewards/margins": 1.1785094738006592, "rewards/rejected": -0.4893798828125, "step": 1520 }, { "epoch": 0.8065366367949394, "grad_norm": 134.08613019626998, "learning_rate": 7.984976278334212e-07, "logits/chosen": 0.8346923589706421, "logits/rejected": 0.6591552495956421, "logps/chosen": -369.5, "logps/rejected": -320.5, "loss": 0.5369, "rewards/accuracies": 0.71875, "rewards/chosen": 1.038476586341858, "rewards/margins": 1.039697289466858, "rewards/rejected": -0.0016357421409338713, "step": 1530 }, { "epoch": 0.8118081180811808, "grad_norm": 108.74692194095388, "learning_rate": 7.971797575118609e-07, "logits/chosen": 0.851855456829071, "logits/rejected": 0.8487793207168579, "logps/chosen": -328.25, "logps/rejected": -294.8999938964844, "loss": 0.6109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.870007336139679, "rewards/margins": 0.827392578125, "rewards/rejected": 0.04246826097369194, "step": 1540 }, { "epoch": 0.8170795993674222, "grad_norm": 89.06901780483855, "learning_rate": 7.958618871903005e-07, "logits/chosen": 0.805957019329071, "logits/rejected": 0.7738281488418579, "logps/chosen": -315.5249938964844, "logps/rejected": -313.95001220703125, "loss": 0.5959, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.641894519329071, "rewards/margins": 0.819628894329071, "rewards/rejected": -0.17749634385108948, "step": 1550 }, { "epoch": 0.8223510806536637, "grad_norm": 114.09944885247121, "learning_rate": 7.9454401686874e-07, "logits/chosen": 0.8194335699081421, "logits/rejected": 0.7961181402206421, "logps/chosen": -355.54998779296875, "logps/rejected": -336.0, "loss": 0.6087, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8164306879043579, "rewards/margins": 0.856982409954071, "rewards/rejected": -0.04042358323931694, "step": 1560 }, { "epoch": 0.8276225619399051, "grad_norm": 111.34837097852406, "learning_rate": 7.932261465471796e-07, "logits/chosen": 0.6976073980331421, "logits/rejected": 0.635357677936554, "logps/chosen": -384.3999938964844, "logps/rejected": -351.79998779296875, "loss": 0.587, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7982422113418579, "rewards/margins": 0.9393554925918579, "rewards/rejected": -0.14197997748851776, "step": 1570 }, { "epoch": 0.8328940432261466, "grad_norm": 130.9579708108328, "learning_rate": 7.919082762256194e-07, "logits/chosen": 0.7212890386581421, "logits/rejected": 0.8905273675918579, "logps/chosen": -311.1499938964844, "logps/rejected": -288.45001220703125, "loss": 0.5928, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.609423816204071, "rewards/margins": 0.71240234375, "rewards/rejected": -0.10261841118335724, "step": 1580 }, { "epoch": 0.838165524512388, "grad_norm": 109.84950693681843, "learning_rate": 7.90590405904059e-07, "logits/chosen": 0.777294933795929, "logits/rejected": 0.775634765625, "logps/chosen": -358.79998779296875, "logps/rejected": -314.6000061035156, "loss": 0.6692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6836181879043579, "rewards/margins": 0.679492175579071, "rewards/rejected": 0.0039276122115552425, "step": 1590 }, { "epoch": 0.8434370057986295, "grad_norm": 140.37633725578377, "learning_rate": 7.892725355824986e-07, "logits/chosen": 0.8824218511581421, "logits/rejected": 0.8202148675918579, "logps/chosen": -354.3999938964844, "logps/rejected": -334.42498779296875, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.739550769329071, "rewards/margins": 0.878222644329071, "rewards/rejected": -0.13889160752296448, "step": 1600 }, { "epoch": 0.8487084870848709, "grad_norm": 91.6693415542738, "learning_rate": 7.879546652609382e-07, "logits/chosen": 0.9189453125, "logits/rejected": 0.78759765625, "logps/chosen": -365.25, "logps/rejected": -347.6499938964844, "loss": 0.4655, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.0216796398162842, "rewards/margins": 1.216210961341858, "rewards/rejected": -0.19500732421875, "step": 1610 }, { "epoch": 0.8539799683711122, "grad_norm": 108.6916536150076, "learning_rate": 7.866367949393779e-07, "logits/chosen": 0.635058581829071, "logits/rejected": 0.684741199016571, "logps/chosen": -389.0, "logps/rejected": -372.0, "loss": 0.4636, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.5975097417831421, "rewards/margins": 1.1671874523162842, "rewards/rejected": -0.5706268548965454, "step": 1620 }, { "epoch": 0.8592514496573537, "grad_norm": 156.99671277894905, "learning_rate": 7.853189246178175e-07, "logits/chosen": 0.6380859613418579, "logits/rejected": 0.6093994379043579, "logps/chosen": -365.8999938964844, "logps/rejected": -348.95001220703125, "loss": 0.6216, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.05121154710650444, "rewards/margins": 0.903735339641571, "rewards/rejected": -0.955078125, "step": 1630 }, { "epoch": 0.8645229309435951, "grad_norm": 127.53465113040099, "learning_rate": 7.840010542962572e-07, "logits/chosen": 0.540484607219696, "logits/rejected": 0.526074230670929, "logps/chosen": -382.6000061035156, "logps/rejected": -361.5, "loss": 0.5559, "rewards/accuracies": 0.71875, "rewards/chosen": 0.263510137796402, "rewards/margins": 0.89599609375, "rewards/rejected": -0.6319335699081421, "step": 1640 }, { "epoch": 0.8697944122298366, "grad_norm": 116.92119496061069, "learning_rate": 7.826831839746969e-07, "logits/chosen": 0.6388183832168579, "logits/rejected": 0.5826660394668579, "logps/chosen": -348.75, "logps/rejected": -349.29998779296875, "loss": 0.4491, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.5229247808456421, "rewards/margins": 1.234375, "rewards/rejected": -0.711132824420929, "step": 1650 }, { "epoch": 0.875065893516078, "grad_norm": 104.09918530654777, "learning_rate": 7.813653136531365e-07, "logits/chosen": 0.62255859375, "logits/rejected": 0.594409167766571, "logps/chosen": -369.82501220703125, "logps/rejected": -355.8500061035156, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": 0.38847655057907104, "rewards/margins": 1.2305908203125, "rewards/rejected": -0.8421875238418579, "step": 1660 }, { "epoch": 0.8803373748023194, "grad_norm": 149.06738129873887, "learning_rate": 7.800474433315761e-07, "logits/chosen": 0.5888427495956421, "logits/rejected": 0.609204113483429, "logps/chosen": -364.3999938964844, "logps/rejected": -341.29998779296875, "loss": 0.5593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26237791776657104, "rewards/margins": 1.1411621570587158, "rewards/rejected": -0.8786255121231079, "step": 1670 }, { "epoch": 0.8856088560885609, "grad_norm": 113.42289262094938, "learning_rate": 7.787295730100157e-07, "logits/chosen": 0.713763415813446, "logits/rejected": 0.650195300579071, "logps/chosen": -359.95001220703125, "logps/rejected": -341.04998779296875, "loss": 0.5737, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3160644471645355, "rewards/margins": 0.98876953125, "rewards/rejected": -0.672656238079071, "step": 1680 }, { "epoch": 0.8908803373748023, "grad_norm": 128.80180757612445, "learning_rate": 7.774117026884554e-07, "logits/chosen": 0.759783923625946, "logits/rejected": 0.7034667730331421, "logps/chosen": -374.1000061035156, "logps/rejected": -344.0, "loss": 0.539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7488769292831421, "rewards/margins": 1.1764404773712158, "rewards/rejected": -0.4284301698207855, "step": 1690 }, { "epoch": 0.8961518186610438, "grad_norm": 65.79583880475073, "learning_rate": 7.760938323668951e-07, "logits/chosen": 0.8646484613418579, "logits/rejected": 0.815478503704071, "logps/chosen": -365.75, "logps/rejected": -329.8999938964844, "loss": 0.6777, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.4588623046875, "rewards/margins": 0.8236328363418579, "rewards/rejected": -0.36512452363967896, "step": 1700 }, { "epoch": 0.9014232999472852, "grad_norm": 122.81987878832749, "learning_rate": 7.747759620453347e-07, "logits/chosen": 0.773303210735321, "logits/rejected": 0.814892590045929, "logps/chosen": -369.70001220703125, "logps/rejected": -375.3999938964844, "loss": 0.5372, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.708056628704071, "rewards/margins": 1.077661156654358, "rewards/rejected": -0.369223028421402, "step": 1710 }, { "epoch": 0.9066947812335266, "grad_norm": 97.0086407062235, "learning_rate": 7.734580917237743e-07, "logits/chosen": 0.7855468988418579, "logits/rejected": 0.709277331829071, "logps/chosen": -389.95001220703125, "logps/rejected": -356.45001220703125, "loss": 0.5606, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.664306640625, "rewards/margins": 1.009179711341858, "rewards/rejected": -0.3464599549770355, "step": 1720 }, { "epoch": 0.9119662625197681, "grad_norm": 78.26357788022224, "learning_rate": 7.72140221402214e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.704541027545929, "logps/chosen": -412.95001220703125, "logps/rejected": -356.1000061035156, "loss": 0.4833, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5758301019668579, "rewards/margins": 1.2136719226837158, "rewards/rejected": -0.638104259967804, "step": 1730 }, { "epoch": 0.9172377438060095, "grad_norm": 124.2470413075545, "learning_rate": 7.708223510806536e-07, "logits/chosen": 0.664746105670929, "logits/rejected": 0.73583984375, "logps/chosen": -379.25, "logps/rejected": -337.5, "loss": 0.5612, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.566876232624054, "rewards/margins": 0.9468994140625, "rewards/rejected": -0.3807128965854645, "step": 1740 }, { "epoch": 0.922509225092251, "grad_norm": 124.38014701630173, "learning_rate": 7.695044807590932e-07, "logits/chosen": 0.710400402545929, "logits/rejected": 0.8827148675918579, "logps/chosen": -357.1000061035156, "logps/rejected": -362.29998779296875, "loss": 0.6094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.40185546875, "rewards/margins": 0.7494140863418579, "rewards/rejected": -0.34735107421875, "step": 1750 }, { "epoch": 0.9277807063784923, "grad_norm": 134.42291514619455, "learning_rate": 7.68186610437533e-07, "logits/chosen": 0.824267566204071, "logits/rejected": 0.8482421636581421, "logps/chosen": -389.1499938964844, "logps/rejected": -349.20001220703125, "loss": 0.6119, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5218750238418579, "rewards/margins": 0.90380859375, "rewards/rejected": -0.38096922636032104, "step": 1760 }, { "epoch": 0.9330521876647337, "grad_norm": 122.27327494758963, "learning_rate": 7.668687401159726e-07, "logits/chosen": 0.758105456829071, "logits/rejected": 0.764941394329071, "logps/chosen": -387.04998779296875, "logps/rejected": -355.79998779296875, "loss": 0.6908, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4339355528354645, "rewards/margins": 0.6908203363418579, "rewards/rejected": -0.25676268339157104, "step": 1770 }, { "epoch": 0.9383236689509752, "grad_norm": 192.34796297591112, "learning_rate": 7.655508697944122e-07, "logits/chosen": 0.9046875238418579, "logits/rejected": 0.7919921875, "logps/chosen": -429.1499938964844, "logps/rejected": -363.1499938964844, "loss": 0.548, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.814892590045929, "rewards/margins": 1.089257836341858, "rewards/rejected": -0.27464598417282104, "step": 1780 }, { "epoch": 0.9435951502372166, "grad_norm": 78.64251232507551, "learning_rate": 7.642329994728518e-07, "logits/chosen": 0.8885742425918579, "logits/rejected": 0.786425769329071, "logps/chosen": -341.1499938964844, "logps/rejected": -335.0, "loss": 0.5344, "rewards/accuracies": 0.71875, "rewards/chosen": 0.666015625, "rewards/margins": 1.0358978509902954, "rewards/rejected": -0.36958009004592896, "step": 1790 }, { "epoch": 0.9488666315234581, "grad_norm": 122.89174226705727, "learning_rate": 7.629151291512915e-07, "logits/chosen": 0.67803955078125, "logits/rejected": 0.572021484375, "logps/chosen": -372.1000061035156, "logps/rejected": -345.1000061035156, "loss": 0.5258, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3991455137729645, "rewards/margins": 1.1423828601837158, "rewards/rejected": -0.7421935796737671, "step": 1800 }, { "epoch": 0.9541381128096995, "grad_norm": 120.17117888128965, "learning_rate": 7.615972588297312e-07, "logits/chosen": 0.621478259563446, "logits/rejected": 0.5977538824081421, "logps/chosen": -406.5, "logps/rejected": -366.29998779296875, "loss": 0.4969, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.548046886920929, "rewards/margins": 1.2216796875, "rewards/rejected": -0.674243152141571, "step": 1810 }, { "epoch": 0.959409594095941, "grad_norm": 67.70424377206719, "learning_rate": 7.602793885081708e-07, "logits/chosen": 0.845507800579071, "logits/rejected": 0.7900390625, "logps/chosen": -362.70001220703125, "logps/rejected": -398.95001220703125, "loss": 0.5434, "rewards/accuracies": 0.71875, "rewards/chosen": 0.511975109577179, "rewards/margins": 1.103271484375, "rewards/rejected": -0.5917114019393921, "step": 1820 }, { "epoch": 0.9646810753821824, "grad_norm": 124.81158804008345, "learning_rate": 7.589615181866104e-07, "logits/chosen": 0.8204101324081421, "logits/rejected": 0.7093750238418579, "logps/chosen": -349.1000061035156, "logps/rejected": -328.67498779296875, "loss": 0.4994, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5162719488143921, "rewards/margins": 1.2468750476837158, "rewards/rejected": -0.730395495891571, "step": 1830 }, { "epoch": 0.9699525566684238, "grad_norm": 110.10269010204539, "learning_rate": 7.576436478650501e-07, "logits/chosen": 0.6540771722793579, "logits/rejected": 0.698779284954071, "logps/chosen": -395.3500061035156, "logps/rejected": -329.75, "loss": 0.6214, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.38823240995407104, "rewards/margins": 1.0271484851837158, "rewards/rejected": -0.63922119140625, "step": 1840 }, { "epoch": 0.9752240379546653, "grad_norm": 115.8468511802113, "learning_rate": 7.563257775434897e-07, "logits/chosen": 0.8583984375, "logits/rejected": 0.829882800579071, "logps/chosen": -331.8500061035156, "logps/rejected": -323.29998779296875, "loss": 0.5951, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7133423089981079, "rewards/margins": 1.038427710533142, "rewards/rejected": -0.32475584745407104, "step": 1850 }, { "epoch": 0.9804955192409067, "grad_norm": 69.30696226908901, "learning_rate": 7.550079072219293e-07, "logits/chosen": 0.850604236125946, "logits/rejected": 0.9359375238418579, "logps/chosen": -330.8500061035156, "logps/rejected": -323.5, "loss": 0.6357, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.960253894329071, "rewards/margins": 0.9216552972793579, "rewards/rejected": 0.03891601413488388, "step": 1860 }, { "epoch": 0.9857670005271482, "grad_norm": 111.26021411386425, "learning_rate": 7.53690036900369e-07, "logits/chosen": 0.8726562261581421, "logits/rejected": 0.873828113079071, "logps/chosen": -336.5, "logps/rejected": -314.1499938964844, "loss": 0.5176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.725903332233429, "rewards/margins": 1.035986304283142, "rewards/rejected": -0.310882568359375, "step": 1870 }, { "epoch": 0.9910384818133896, "grad_norm": 62.97223805096217, "learning_rate": 7.523721665788087e-07, "logits/chosen": 0.944042980670929, "logits/rejected": 0.8441406488418579, "logps/chosen": -348.1000061035156, "logps/rejected": -344.79998779296875, "loss": 0.4771, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.692828357219696, "rewards/margins": 1.11004638671875, "rewards/rejected": -0.4177002012729645, "step": 1880 }, { "epoch": 0.996309963099631, "grad_norm": 88.74162231196995, "learning_rate": 7.510542962572483e-07, "logits/chosen": 0.774462878704071, "logits/rejected": 0.723339855670929, "logps/chosen": -358.95001220703125, "logps/rejected": -325.3999938964844, "loss": 0.4592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6785980463027954, "rewards/margins": 1.3522460460662842, "rewards/rejected": -0.673541247844696, "step": 1890 }, { "epoch": 1.0015814443858724, "grad_norm": 16.088198139760788, "learning_rate": 7.497364259356879e-07, "logits/chosen": 0.659832775592804, "logits/rejected": 0.5938476324081421, "logps/chosen": -365.1000061035156, "logps/rejected": -343.5, "loss": 0.4545, "rewards/accuracies": 0.7541666626930237, "rewards/chosen": 0.781054675579071, "rewards/margins": 1.9716308116912842, "rewards/rejected": -1.1892578601837158, "step": 1900 }, { "epoch": 1.006852925672114, "grad_norm": 17.60188788471952, "learning_rate": 7.484185556141276e-07, "logits/chosen": 0.673327624797821, "logits/rejected": 0.616650402545929, "logps/chosen": -374.95001220703125, "logps/rejected": -401.1499938964844, "loss": 0.0743, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.851953148841858, "rewards/margins": 4.247656345367432, "rewards/rejected": -2.396484375, "step": 1910 }, { "epoch": 1.0121244069583553, "grad_norm": 25.889122352315454, "learning_rate": 7.471006852925671e-07, "logits/chosen": 0.687451183795929, "logits/rejected": 0.510040283203125, "logps/chosen": -352.75, "logps/rejected": -411.45001220703125, "loss": 0.1151, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5076172351837158, "rewards/margins": 4.313281059265137, "rewards/rejected": -2.8062500953674316, "step": 1920 }, { "epoch": 1.0173958882445968, "grad_norm": 31.279476792629776, "learning_rate": 7.457828149710068e-07, "logits/chosen": 0.5005553960800171, "logits/rejected": 0.4222396910190582, "logps/chosen": -350.20001220703125, "logps/rejected": -382.95001220703125, "loss": 0.1198, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.0436522960662842, "rewards/margins": 4.140625, "rewards/rejected": -3.0960936546325684, "step": 1930 }, { "epoch": 1.0226673695308381, "grad_norm": 31.440028572636166, "learning_rate": 7.444649446494464e-07, "logits/chosen": 0.32573240995407104, "logits/rejected": 0.17266845703125, "logps/chosen": -361.29998779296875, "logps/rejected": -372.1000061035156, "loss": 0.0942, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.658905029296875, "rewards/margins": 4.713281154632568, "rewards/rejected": -4.053124904632568, "step": 1940 }, { "epoch": 1.0279388508170797, "grad_norm": 41.3116989079319, "learning_rate": 7.431470743278861e-07, "logits/chosen": 0.4408020079135895, "logits/rejected": 0.3642944395542145, "logps/chosen": -344.4750061035156, "logps/rejected": -359.1000061035156, "loss": 0.1072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.100653052330017, "rewards/margins": 4.516406059265137, "rewards/rejected": -3.41796875, "step": 1950 }, { "epoch": 1.033210332103321, "grad_norm": 46.68216987794611, "learning_rate": 7.418292040063257e-07, "logits/chosen": 0.6499267816543579, "logits/rejected": 0.569750964641571, "logps/chosen": -360.1000061035156, "logps/rejected": -358.04998779296875, "loss": 0.108, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.450170874595642, "rewards/margins": 3.867968797683716, "rewards/rejected": -2.416796922683716, "step": 1960 }, { "epoch": 1.0384818133895624, "grad_norm": 26.544920264567292, "learning_rate": 7.405113336847653e-07, "logits/chosen": 0.60675048828125, "logits/rejected": 0.6357421875, "logps/chosen": -367.45001220703125, "logps/rejected": -345.79998779296875, "loss": 0.0997, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.688085913658142, "rewards/margins": 4.015625, "rewards/rejected": -2.327343702316284, "step": 1970 }, { "epoch": 1.043753294675804, "grad_norm": 17.686941389994093, "learning_rate": 7.39193463363205e-07, "logits/chosen": 0.6263496279716492, "logits/rejected": 0.625292956829071, "logps/chosen": -347.3999938964844, "logps/rejected": -394.04998779296875, "loss": 0.0869, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1285157203674316, "rewards/margins": 4.5546875, "rewards/rejected": -2.42578125, "step": 1980 }, { "epoch": 1.0490247759620452, "grad_norm": 16.235841608600182, "learning_rate": 7.378755930416447e-07, "logits/chosen": 0.592089831829071, "logits/rejected": 0.4837890565395355, "logps/chosen": -369.04998779296875, "logps/rejected": -355.0, "loss": 0.0859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8427734375, "rewards/margins": 3.8695311546325684, "rewards/rejected": -2.0263671875, "step": 1990 }, { "epoch": 1.0542962572482868, "grad_norm": 89.3855227637759, "learning_rate": 7.365577227200843e-07, "logits/chosen": 0.435546875, "logits/rejected": 0.215800479054451, "logps/chosen": -304.4750061035156, "logps/rejected": -319.0, "loss": 0.1095, "rewards/accuracies": 0.96875, "rewards/chosen": 0.98876953125, "rewards/margins": 4.110937595367432, "rewards/rejected": -3.1214842796325684, "step": 2000 }, { "epoch": 1.0595677385345281, "grad_norm": 31.329944969172885, "learning_rate": 7.352398523985239e-07, "logits/chosen": 0.32185059785842896, "logits/rejected": 0.21956177055835724, "logps/chosen": -329.04998779296875, "logps/rejected": -362.3500061035156, "loss": 0.1249, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4353881776332855, "rewards/margins": 4.181250095367432, "rewards/rejected": -3.7421875, "step": 2010 }, { "epoch": 1.0648392198207697, "grad_norm": 37.576349074433864, "learning_rate": 7.339219820769635e-07, "logits/chosen": 0.24242553114891052, "logits/rejected": 0.3189330995082855, "logps/chosen": -339.79998779296875, "logps/rejected": -371.29998779296875, "loss": 0.1056, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9930664300918579, "rewards/margins": 4.28125, "rewards/rejected": -3.290234327316284, "step": 2020 }, { "epoch": 1.070110701107011, "grad_norm": 14.829965548517086, "learning_rate": 7.326041117554032e-07, "logits/chosen": 0.3291992247104645, "logits/rejected": 0.29248046875, "logps/chosen": -321.54998779296875, "logps/rejected": -357.04998779296875, "loss": 0.0843, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.177880883216858, "rewards/margins": 4.3671875, "rewards/rejected": -3.192187547683716, "step": 2030 }, { "epoch": 1.0753821823932526, "grad_norm": 42.93099339991402, "learning_rate": 7.312862414338429e-07, "logits/chosen": 0.550585925579071, "logits/rejected": 0.3342788815498352, "logps/chosen": -424.95001220703125, "logps/rejected": -396.5, "loss": 0.0901, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3030273914337158, "rewards/margins": 4.771093845367432, "rewards/rejected": -3.475781202316284, "step": 2040 }, { "epoch": 1.080653663679494, "grad_norm": 33.7262806611382, "learning_rate": 7.299683711122825e-07, "logits/chosen": 0.3790832459926605, "logits/rejected": 0.25023192167282104, "logps/chosen": -353.1499938964844, "logps/rejected": -369.04998779296875, "loss": 0.0778, "rewards/accuracies": 0.96875, "rewards/chosen": 0.688916027545929, "rewards/margins": 4.685937404632568, "rewards/rejected": -3.99609375, "step": 2050 }, { "epoch": 1.0859251449657354, "grad_norm": 65.47151034589935, "learning_rate": 7.286505007907222e-07, "logits/chosen": 0.19584961235523224, "logits/rejected": 0.10609130561351776, "logps/chosen": -373.5, "logps/rejected": -373.5, "loss": 0.1215, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4765991270542145, "rewards/margins": 4.483593940734863, "rewards/rejected": -4.004296779632568, "step": 2060 }, { "epoch": 1.0911966262519768, "grad_norm": 50.03039479493285, "learning_rate": 7.273326304691618e-07, "logits/chosen": 0.3067260682582855, "logits/rejected": 0.26470947265625, "logps/chosen": -345.70001220703125, "logps/rejected": -369.45001220703125, "loss": 0.1024, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5346618890762329, "rewards/margins": 4.34375, "rewards/rejected": -3.807812452316284, "step": 2070 }, { "epoch": 1.0964681075382183, "grad_norm": 33.68797222095627, "learning_rate": 7.260147601476014e-07, "logits/chosen": 0.39848631620407104, "logits/rejected": 0.3028320372104645, "logps/chosen": -343.54998779296875, "logps/rejected": -355.1000061035156, "loss": 0.1406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9963134527206421, "rewards/margins": 3.995312452316284, "rewards/rejected": -2.999218702316284, "step": 2080 }, { "epoch": 1.1017395888244597, "grad_norm": 33.47169904698364, "learning_rate": 7.24696889826041e-07, "logits/chosen": 0.34429931640625, "logits/rejected": 0.20577391982078552, "logps/chosen": -300.45001220703125, "logps/rejected": -326.95001220703125, "loss": 0.0997, "rewards/accuracies": 0.96875, "rewards/chosen": 1.051049828529358, "rewards/margins": 4.162499904632568, "rewards/rejected": -3.1117186546325684, "step": 2090 }, { "epoch": 1.1070110701107012, "grad_norm": 55.88605690436924, "learning_rate": 7.233790195044808e-07, "logits/chosen": 0.34877318143844604, "logits/rejected": 0.23103027045726776, "logps/chosen": -379.75, "logps/rejected": -386.6000061035156, "loss": 0.1127, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.095849633216858, "rewards/margins": 4.53515625, "rewards/rejected": -3.4375, "step": 2100 }, { "epoch": 1.1122825513969425, "grad_norm": 10.124102095114793, "learning_rate": 7.220611491829204e-07, "logits/chosen": 0.31928712129592896, "logits/rejected": 0.27174073457717896, "logps/chosen": -376.0, "logps/rejected": -402.20001220703125, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 1.0074462890625, "rewards/margins": 4.58203125, "rewards/rejected": -3.5746092796325684, "step": 2110 }, { "epoch": 1.1175540326831839, "grad_norm": 128.18727411955538, "learning_rate": 7.2074327886136e-07, "logits/chosen": 0.3551025390625, "logits/rejected": 0.28703612089157104, "logps/chosen": -342.17498779296875, "logps/rejected": -368.6499938964844, "loss": 0.122, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.97662353515625, "rewards/margins": 4.143750190734863, "rewards/rejected": -3.1683592796325684, "step": 2120 }, { "epoch": 1.1228255139694254, "grad_norm": 36.618961629049274, "learning_rate": 7.194254085397996e-07, "logits/chosen": 0.47114259004592896, "logits/rejected": 0.47633057832717896, "logps/chosen": -354.20001220703125, "logps/rejected": -397.6000061035156, "loss": 0.1067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.213598608970642, "rewards/margins": 4.28125, "rewards/rejected": -3.0718750953674316, "step": 2130 }, { "epoch": 1.1280969952556668, "grad_norm": 58.16683825974097, "learning_rate": 7.181075382182393e-07, "logits/chosen": 0.4850097596645355, "logits/rejected": 0.3153930604457855, "logps/chosen": -359.45001220703125, "logps/rejected": -365.20001220703125, "loss": 0.1012, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.038183569908142, "rewards/margins": 4.214062690734863, "rewards/rejected": -3.174999952316284, "step": 2140 }, { "epoch": 1.1333684765419083, "grad_norm": 20.907046796644288, "learning_rate": 7.16789667896679e-07, "logits/chosen": 0.42564696073532104, "logits/rejected": 0.3066848814487457, "logps/chosen": -370.75, "logps/rejected": -402.3999938964844, "loss": 0.0811, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.227148413658142, "rewards/margins": 5.256249904632568, "rewards/rejected": -4.028906345367432, "step": 2150 }, { "epoch": 1.1386399578281496, "grad_norm": 42.74888547605807, "learning_rate": 7.154717975751186e-07, "logits/chosen": 0.14150390028953552, "logits/rejected": 0.22476807236671448, "logps/chosen": -345.75, "logps/rejected": -382.8999938964844, "loss": 0.1077, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.799182116985321, "rewards/margins": 4.428124904632568, "rewards/rejected": -3.6328125, "step": 2160 }, { "epoch": 1.1439114391143912, "grad_norm": 20.440324662329008, "learning_rate": 7.141539272535582e-07, "logits/chosen": 0.3558349609375, "logits/rejected": 0.24187469482421875, "logps/chosen": -388.0, "logps/rejected": -390.04998779296875, "loss": 0.0826, "rewards/accuracies": 0.96875, "rewards/chosen": 1.113244652748108, "rewards/margins": 4.461718559265137, "rewards/rejected": -3.350781202316284, "step": 2170 }, { "epoch": 1.1491829204006325, "grad_norm": 45.483533659263436, "learning_rate": 7.128360569319979e-07, "logits/chosen": 0.41059571504592896, "logits/rejected": 0.2563720643520355, "logps/chosen": -348.95001220703125, "logps/rejected": -350.75, "loss": 0.1257, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.100610375404358, "rewards/margins": 3.9664063453674316, "rewards/rejected": -2.86328125, "step": 2180 }, { "epoch": 1.154454401686874, "grad_norm": 9.207710132833915, "learning_rate": 7.115181866104375e-07, "logits/chosen": 0.4508407711982727, "logits/rejected": 0.3493194580078125, "logps/chosen": -353.9750061035156, "logps/rejected": -353.70001220703125, "loss": 0.0932, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.3916015625, "rewards/margins": 4.4140625, "rewards/rejected": -3.021484375, "step": 2190 }, { "epoch": 1.1597258829731154, "grad_norm": 27.035957378796006, "learning_rate": 7.102003162888771e-07, "logits/chosen": 0.3789428770542145, "logits/rejected": 0.205647274851799, "logps/chosen": -362.04998779296875, "logps/rejected": -380.3500061035156, "loss": 0.0983, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2262451648712158, "rewards/margins": 4.7578125, "rewards/rejected": -3.532421827316284, "step": 2200 }, { "epoch": 1.164997364259357, "grad_norm": 489.466823986554, "learning_rate": 7.088824459673169e-07, "logits/chosen": 0.3777938783168793, "logits/rejected": 0.20441894233226776, "logps/chosen": -365.20001220703125, "logps/rejected": -351.54998779296875, "loss": 0.1049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.077978491783142, "rewards/margins": 4.542187690734863, "rewards/rejected": -3.460156202316284, "step": 2210 }, { "epoch": 1.1702688455455983, "grad_norm": 15.596214560124267, "learning_rate": 7.075645756457565e-07, "logits/chosen": 0.31895750761032104, "logits/rejected": 0.19613036513328552, "logps/chosen": -371.70001220703125, "logps/rejected": -364.6000061035156, "loss": 0.0944, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.1073486804962158, "rewards/margins": 4.672656059265137, "rewards/rejected": -3.5648436546325684, "step": 2220 }, { "epoch": 1.1755403268318398, "grad_norm": 43.376379129455934, "learning_rate": 7.062467053241961e-07, "logits/chosen": 0.30566102266311646, "logits/rejected": 0.2599441409111023, "logps/chosen": -343.54998779296875, "logps/rejected": -367.29998779296875, "loss": 0.1182, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.974841296672821, "rewards/margins": 4.569531440734863, "rewards/rejected": -3.592968702316284, "step": 2230 }, { "epoch": 1.1808118081180812, "grad_norm": 72.24734650629144, "learning_rate": 7.049288350026357e-07, "logits/chosen": 0.2781723141670227, "logits/rejected": 0.09237060695886612, "logps/chosen": -347.0, "logps/rejected": -348.3500061035156, "loss": 0.1064, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6849120855331421, "rewards/margins": 4.4453125, "rewards/rejected": -3.7593750953674316, "step": 2240 }, { "epoch": 1.1860832894043227, "grad_norm": 11.531662712321923, "learning_rate": 7.036109646810754e-07, "logits/chosen": 0.3675537109375, "logits/rejected": 0.40211182832717896, "logps/chosen": -345.5, "logps/rejected": -391.1499938964844, "loss": 0.1237, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9292968511581421, "rewards/margins": 4.367968559265137, "rewards/rejected": -3.4398436546325684, "step": 2250 }, { "epoch": 1.191354770690564, "grad_norm": 41.80695828201633, "learning_rate": 7.02293094359515e-07, "logits/chosen": 0.41076356172561646, "logits/rejected": 0.22617188096046448, "logps/chosen": -376.95001220703125, "logps/rejected": -391.25, "loss": 0.0969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7764037847518921, "rewards/margins": 4.56640625, "rewards/rejected": -3.7890625, "step": 2260 }, { "epoch": 1.1966262519768054, "grad_norm": 50.690300617427795, "learning_rate": 7.009752240379547e-07, "logits/chosen": 0.19973143935203552, "logits/rejected": 0.15718993544578552, "logps/chosen": -379.1499938964844, "logps/rejected": -413.20001220703125, "loss": 0.1035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.584887683391571, "rewards/margins": 4.625, "rewards/rejected": -4.039843559265137, "step": 2270 }, { "epoch": 1.201897733263047, "grad_norm": 12.781942648259916, "learning_rate": 6.996573537163942e-07, "logits/chosen": 0.27339476346969604, "logits/rejected": 0.09674072265625, "logps/chosen": -403.3500061035156, "logps/rejected": -416.5, "loss": 0.0672, "rewards/accuracies": 0.96875, "rewards/chosen": 0.12348632514476776, "rewards/margins": 5.609375, "rewards/rejected": -5.482812404632568, "step": 2280 }, { "epoch": 1.2071692145492883, "grad_norm": 10.374939973433365, "learning_rate": 6.98339483394834e-07, "logits/chosen": 0.24887695908546448, "logits/rejected": 0.11496581882238388, "logps/chosen": -367.5, "logps/rejected": -390.0, "loss": 0.1068, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10139770805835724, "rewards/margins": 4.703906059265137, "rewards/rejected": -4.603906154632568, "step": 2290 }, { "epoch": 1.2124406958355298, "grad_norm": 22.531139450937953, "learning_rate": 6.970216130732735e-07, "logits/chosen": 0.13420410454273224, "logits/rejected": 0.14774170517921448, "logps/chosen": -329.1000061035156, "logps/rejected": -352.3500061035156, "loss": 0.0927, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5040649175643921, "rewards/margins": 4.82421875, "rewards/rejected": -4.314843654632568, "step": 2300 }, { "epoch": 1.2177121771217712, "grad_norm": 53.54539448785891, "learning_rate": 6.957037427517131e-07, "logits/chosen": 0.2779296934604645, "logits/rejected": 0.24639892578125, "logps/chosen": -378.1000061035156, "logps/rejected": -359.04998779296875, "loss": 0.0911, "rewards/accuracies": 0.96875, "rewards/chosen": 0.813793957233429, "rewards/margins": 4.627343654632568, "rewards/rejected": -3.8109374046325684, "step": 2310 }, { "epoch": 1.2229836584080127, "grad_norm": 13.964884868368523, "learning_rate": 6.943858724301529e-07, "logits/chosen": 0.4254394471645355, "logits/rejected": 0.28338623046875, "logps/chosen": -311.29998779296875, "logps/rejected": -347.29998779296875, "loss": 0.1, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.98095703125, "rewards/margins": 4.291406154632568, "rewards/rejected": -3.309375047683716, "step": 2320 }, { "epoch": 1.228255139694254, "grad_norm": 26.963360187264595, "learning_rate": 6.930680021085925e-07, "logits/chosen": 0.4260009825229645, "logits/rejected": 0.39996337890625, "logps/chosen": -360.75, "logps/rejected": -392.0, "loss": 0.1025, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.0859863758087158, "rewards/margins": 4.611718654632568, "rewards/rejected": -3.522656202316284, "step": 2330 }, { "epoch": 1.2335266209804956, "grad_norm": 47.96024817717915, "learning_rate": 6.917501317870321e-07, "logits/chosen": 0.30187225341796875, "logits/rejected": 0.3317504823207855, "logps/chosen": -330.3999938964844, "logps/rejected": -332.54998779296875, "loss": 0.1374, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.1018555164337158, "rewards/margins": 4.216015815734863, "rewards/rejected": -3.11328125, "step": 2340 }, { "epoch": 1.238798102266737, "grad_norm": 46.239785885437115, "learning_rate": 6.904322614654717e-07, "logits/chosen": 0.2776855528354645, "logits/rejected": 0.17962646484375, "logps/chosen": -381.1499938964844, "logps/rejected": -333.8999938964844, "loss": 0.1041, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0890991687774658, "rewards/margins": 4.411718845367432, "rewards/rejected": -3.323437452316284, "step": 2350 }, { "epoch": 1.2440695835529785, "grad_norm": 26.26855239720641, "learning_rate": 6.891143911439114e-07, "logits/chosen": 0.33269041776657104, "logits/rejected": 0.37109375, "logps/chosen": -326.5, "logps/rejected": -363.95001220703125, "loss": 0.1794, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3065398931503296, "rewards/margins": 4.289843559265137, "rewards/rejected": -2.9828124046325684, "step": 2360 }, { "epoch": 1.2493410648392198, "grad_norm": 26.437180197486924, "learning_rate": 6.87796520822351e-07, "logits/chosen": 0.33928221464157104, "logits/rejected": 0.4745849668979645, "logps/chosen": -352.25, "logps/rejected": -387.95001220703125, "loss": 0.0802, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.029687523841858, "rewards/margins": 4.541406154632568, "rewards/rejected": -3.508593797683716, "step": 2370 }, { "epoch": 1.2546125461254611, "grad_norm": 51.67015117701914, "learning_rate": 6.864786505007907e-07, "logits/chosen": 0.3515625, "logits/rejected": 0.22679634392261505, "logps/chosen": -347.29998779296875, "logps/rejected": -386.0, "loss": 0.0928, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.785491943359375, "rewards/margins": 4.711718559265137, "rewards/rejected": -3.9312500953674316, "step": 2380 }, { "epoch": 1.2598840274117027, "grad_norm": 36.25019084544046, "learning_rate": 6.851607801792303e-07, "logits/chosen": 0.21063843369483948, "logits/rejected": 0.19803467392921448, "logps/chosen": -347.70001220703125, "logps/rejected": -348.1499938964844, "loss": 0.0971, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6181274652481079, "rewards/margins": 4.265625, "rewards/rejected": -3.653125047683716, "step": 2390 }, { "epoch": 1.2651555086979442, "grad_norm": 32.58630005394569, "learning_rate": 6.8384290985767e-07, "logits/chosen": 0.32709962129592896, "logits/rejected": 0.153797909617424, "logps/chosen": -369.95001220703125, "logps/rejected": -385.6499938964844, "loss": 0.1022, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.450827032327652, "rewards/margins": 4.390625, "rewards/rejected": -3.938281297683716, "step": 2400 }, { "epoch": 1.2704269899841856, "grad_norm": 52.84124130589642, "learning_rate": 6.825250395361096e-07, "logits/chosen": 0.35066527128219604, "logits/rejected": 0.13029174506664276, "logps/chosen": -332.25, "logps/rejected": -359.95001220703125, "loss": 0.1022, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6614745855331421, "rewards/margins": 4.8046875, "rewards/rejected": -4.140625, "step": 2410 }, { "epoch": 1.275698471270427, "grad_norm": 18.710939811662293, "learning_rate": 6.812071692145492e-07, "logits/chosen": 0.2551635801792145, "logits/rejected": 0.21240234375, "logps/chosen": -383.20001220703125, "logps/rejected": -402.75, "loss": 0.0514, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.908007800579071, "rewards/margins": 5.219531059265137, "rewards/rejected": -4.314062595367432, "step": 2420 }, { "epoch": 1.2809699525566685, "grad_norm": 27.34773258752063, "learning_rate": 6.798892988929888e-07, "logits/chosen": 0.38092344999313354, "logits/rejected": 0.25075989961624146, "logps/chosen": -344.75, "logps/rejected": -359.25, "loss": 0.0876, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.691943347454071, "rewards/margins": 4.34765625, "rewards/rejected": -3.655468702316284, "step": 2430 }, { "epoch": 1.2862414338429098, "grad_norm": 22.9223972958348, "learning_rate": 6.785714285714286e-07, "logits/chosen": 0.339569091796875, "logits/rejected": 0.15089111030101776, "logps/chosen": -369.6499938964844, "logps/rejected": -356.95001220703125, "loss": 0.084, "rewards/accuracies": 0.96875, "rewards/chosen": 0.948840320110321, "rewards/margins": 4.65625, "rewards/rejected": -3.7046875953674316, "step": 2440 }, { "epoch": 1.2915129151291513, "grad_norm": 73.58063242289221, "learning_rate": 6.772535582498682e-07, "logits/chosen": 0.3430847227573395, "logits/rejected": 0.19711914658546448, "logps/chosen": -382.45001220703125, "logps/rejected": -385.6000061035156, "loss": 0.1201, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0611572265625, "rewards/margins": 5.0, "rewards/rejected": -3.940624952316284, "step": 2450 }, { "epoch": 1.2967843964153927, "grad_norm": 31.177178062532505, "learning_rate": 6.759356879283078e-07, "logits/chosen": 0.4441894590854645, "logits/rejected": 0.3541626036167145, "logps/chosen": -368.1000061035156, "logps/rejected": -414.1000061035156, "loss": 0.0953, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.142578125, "rewards/margins": 4.740624904632568, "rewards/rejected": -3.5953125953674316, "step": 2460 }, { "epoch": 1.3020558777016342, "grad_norm": 85.44355213494322, "learning_rate": 6.746178176067475e-07, "logits/chosen": 0.4015258848667145, "logits/rejected": 0.4278198182582855, "logps/chosen": -349.04998779296875, "logps/rejected": -375.04998779296875, "loss": 0.0982, "rewards/accuracies": 0.96875, "rewards/chosen": 1.481347680091858, "rewards/margins": 4.6484375, "rewards/rejected": -3.166015625, "step": 2470 }, { "epoch": 1.3073273589878756, "grad_norm": 32.979275315125534, "learning_rate": 6.732999472851871e-07, "logits/chosen": 0.3721252381801605, "logits/rejected": 0.3237670958042145, "logps/chosen": -315.5, "logps/rejected": -314.25, "loss": 0.1887, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.046533226966858, "rewards/margins": 4.250390529632568, "rewards/rejected": -3.2027344703674316, "step": 2480 }, { "epoch": 1.312598840274117, "grad_norm": 37.075524966425434, "learning_rate": 6.719820769636268e-07, "logits/chosen": 0.31951904296875, "logits/rejected": 0.23160400986671448, "logps/chosen": -356.1000061035156, "logps/rejected": -397.20001220703125, "loss": 0.0947, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.175927758216858, "rewards/margins": 4.594531059265137, "rewards/rejected": -3.421875, "step": 2490 }, { "epoch": 1.3178703215603584, "grad_norm": 13.648654269872358, "learning_rate": 6.706642066420664e-07, "logits/chosen": 0.35185545682907104, "logits/rejected": 0.30177003145217896, "logps/chosen": -416.70001220703125, "logps/rejected": -429.3999938964844, "loss": 0.0812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.175878882408142, "rewards/margins": 5.3515625, "rewards/rejected": -4.170312404632568, "step": 2500 }, { "epoch": 1.3231418028466, "grad_norm": 14.264672683379771, "learning_rate": 6.693463363205061e-07, "logits/chosen": 0.07253418117761612, "logits/rejected": -0.05994873121380806, "logps/chosen": -394.54998779296875, "logps/rejected": -392.1499938964844, "loss": 0.1045, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.12486572563648224, "rewards/margins": 4.775000095367432, "rewards/rejected": -4.653906345367432, "step": 2510 }, { "epoch": 1.3284132841328413, "grad_norm": 20.480837430089302, "learning_rate": 6.680284659989457e-07, "logits/chosen": 0.13726195693016052, "logits/rejected": -0.05324707180261612, "logps/chosen": -379.29998779296875, "logps/rejected": -369.20001220703125, "loss": 0.1033, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04422607272863388, "rewards/margins": 4.721875190734863, "rewards/rejected": -4.678906440734863, "step": 2520 }, { "epoch": 1.3336847654190827, "grad_norm": 15.220112902711948, "learning_rate": 6.667105956773853e-07, "logits/chosen": 0.21956786513328552, "logits/rejected": -0.005261230282485485, "logps/chosen": -372.29998779296875, "logps/rejected": -376.95001220703125, "loss": 0.0999, "rewards/accuracies": 0.96875, "rewards/chosen": 0.157379150390625, "rewards/margins": 5.036718845367432, "rewards/rejected": -4.874218940734863, "step": 2530 }, { "epoch": 1.3389562467053242, "grad_norm": 21.90493276393229, "learning_rate": 6.653927253558249e-07, "logits/chosen": 0.2512350082397461, "logits/rejected": 0.16865234076976776, "logps/chosen": -401.29998779296875, "logps/rejected": -414.0, "loss": 0.104, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4837646484375, "rewards/margins": 4.875781059265137, "rewards/rejected": -4.389062404632568, "step": 2540 }, { "epoch": 1.3442277279915658, "grad_norm": 31.182499456562848, "learning_rate": 6.640748550342647e-07, "logits/chosen": 0.22647705674171448, "logits/rejected": 0.11135254055261612, "logps/chosen": -325.75, "logps/rejected": -334.45001220703125, "loss": 0.0879, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.630780041217804, "rewards/margins": 4.493750095367432, "rewards/rejected": -3.8578124046325684, "step": 2550 }, { "epoch": 1.349499209277807, "grad_norm": 27.969313373748033, "learning_rate": 6.627569847127043e-07, "logits/chosen": 0.19984741508960724, "logits/rejected": 0.12531737983226776, "logps/chosen": -352.8999938964844, "logps/rejected": -375.6499938964844, "loss": 0.1021, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.572509765625, "rewards/margins": 4.6171875, "rewards/rejected": -4.041406154632568, "step": 2560 }, { "epoch": 1.3547706905640484, "grad_norm": 75.63233581786392, "learning_rate": 6.614391143911439e-07, "logits/chosen": 0.14654541015625, "logits/rejected": 0.07712402194738388, "logps/chosen": -311.0, "logps/rejected": -359.70001220703125, "loss": 0.1041, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.597796618938446, "rewards/margins": 4.538281440734863, "rewards/rejected": -3.94140625, "step": 2570 }, { "epoch": 1.36004217185029, "grad_norm": 38.180971275861154, "learning_rate": 6.601212440695835e-07, "logits/chosen": 0.2604522705078125, "logits/rejected": 0.04378356784582138, "logps/chosen": -405.0, "logps/rejected": -416.1499938964844, "loss": 0.1217, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.7610839605331421, "rewards/margins": 5.185156345367432, "rewards/rejected": -4.421093940734863, "step": 2580 }, { "epoch": 1.3653136531365313, "grad_norm": 33.50991290627085, "learning_rate": 6.588033737480232e-07, "logits/chosen": 0.2532958984375, "logits/rejected": 0.18348388373851776, "logps/chosen": -360.3999938964844, "logps/rejected": -391.8500061035156, "loss": 0.1143, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.921191394329071, "rewards/margins": 4.640625, "rewards/rejected": -3.7222657203674316, "step": 2590 }, { "epoch": 1.3705851344227729, "grad_norm": 32.75535087732039, "learning_rate": 6.574855034264628e-07, "logits/chosen": 0.3452819883823395, "logits/rejected": 0.3809570372104645, "logps/chosen": -383.3999938964844, "logps/rejected": -420.8999938964844, "loss": 0.0859, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1107909679412842, "rewards/margins": 4.927343845367432, "rewards/rejected": -3.813281297683716, "step": 2600 }, { "epoch": 1.3758566157090142, "grad_norm": 45.6062311458944, "learning_rate": 6.561676331049025e-07, "logits/chosen": 0.4324951171875, "logits/rejected": 0.23875732719898224, "logps/chosen": -395.3500061035156, "logps/rejected": -413.25, "loss": 0.0937, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.881182849407196, "rewards/margins": 4.918749809265137, "rewards/rejected": -4.039843559265137, "step": 2610 }, { "epoch": 1.3811280969952557, "grad_norm": 33.33890756130534, "learning_rate": 6.548497627833422e-07, "logits/chosen": 0.20543518662452698, "logits/rejected": 0.05928955227136612, "logps/chosen": -337.6000061035156, "logps/rejected": -377.7250061035156, "loss": 0.1144, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5514160394668579, "rewards/margins": 4.469531059265137, "rewards/rejected": -3.920703172683716, "step": 2620 }, { "epoch": 1.386399578281497, "grad_norm": 8.084768511557806, "learning_rate": 6.535318924617818e-07, "logits/chosen": 0.15636596083641052, "logits/rejected": 0.23785400390625, "logps/chosen": -347.0, "logps/rejected": -373.20001220703125, "loss": 0.0678, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5383545160293579, "rewards/margins": 5.131249904632568, "rewards/rejected": -4.590624809265137, "step": 2630 }, { "epoch": 1.3916710595677384, "grad_norm": 32.00153776342143, "learning_rate": 6.522140221402213e-07, "logits/chosen": 0.2607421875, "logits/rejected": 0.10794677585363388, "logps/chosen": -351.5, "logps/rejected": -380.54998779296875, "loss": 0.0784, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.8244994878768921, "rewards/margins": 5.117968559265137, "rewards/rejected": -4.289843559265137, "step": 2640 }, { "epoch": 1.39694254085398, "grad_norm": 33.63247788970677, "learning_rate": 6.508961518186609e-07, "logits/chosen": 0.13396605849266052, "logits/rejected": -0.0029296875, "logps/chosen": -361.45001220703125, "logps/rejected": -389.45001220703125, "loss": 0.0922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.783703625202179, "rewards/margins": 5.078906059265137, "rewards/rejected": -4.293749809265137, "step": 2650 }, { "epoch": 1.4022140221402215, "grad_norm": 36.64563909023535, "learning_rate": 6.495782814971007e-07, "logits/chosen": 0.3492431640625, "logits/rejected": 0.23748779296875, "logps/chosen": -391.5, "logps/rejected": -426.8500061035156, "loss": 0.0893, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.672839343547821, "rewards/margins": 4.90234375, "rewards/rejected": -4.232812404632568, "step": 2660 }, { "epoch": 1.4074855034264628, "grad_norm": 40.112065126898486, "learning_rate": 6.482604111755403e-07, "logits/chosen": 0.209320068359375, "logits/rejected": 0.14488525688648224, "logps/chosen": -350.8500061035156, "logps/rejected": -396.8999938964844, "loss": 0.089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.533923327922821, "rewards/margins": 4.698437690734863, "rewards/rejected": -4.161718845367432, "step": 2670 }, { "epoch": 1.4127569847127042, "grad_norm": 14.555018115565822, "learning_rate": 6.469425408539799e-07, "logits/chosen": 0.36958009004592896, "logits/rejected": 0.16191406548023224, "logps/chosen": -354.25, "logps/rejected": -392.54998779296875, "loss": 0.1024, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.25666505098342896, "rewards/margins": 5.013281345367432, "rewards/rejected": -4.760156154632568, "step": 2680 }, { "epoch": 1.4180284659989457, "grad_norm": 15.368737946642087, "learning_rate": 6.456246705324195e-07, "logits/chosen": 0.2560180723667145, "logits/rejected": 0.27061766386032104, "logps/chosen": -399.5, "logps/rejected": -399.6000061035156, "loss": 0.091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6278442144393921, "rewards/margins": 5.224218845367432, "rewards/rejected": -4.600781440734863, "step": 2690 }, { "epoch": 1.4232999472851873, "grad_norm": 22.28476475911154, "learning_rate": 6.443068002108592e-07, "logits/chosen": 0.11110229790210724, "logits/rejected": 0.15610961616039276, "logps/chosen": -371.95001220703125, "logps/rejected": -390.54998779296875, "loss": 0.082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.683764636516571, "rewards/margins": 5.046093940734863, "rewards/rejected": -4.362500190734863, "step": 2700 }, { "epoch": 1.4285714285714286, "grad_norm": 52.20545246809938, "learning_rate": 6.429889298892988e-07, "logits/chosen": 0.266021728515625, "logits/rejected": 0.14592285454273224, "logps/chosen": -374.6499938964844, "logps/rejected": -372.70001220703125, "loss": 0.0876, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1679198741912842, "rewards/margins": 4.932812690734863, "rewards/rejected": -3.760937452316284, "step": 2710 }, { "epoch": 1.43384290985767, "grad_norm": 12.98393019936555, "learning_rate": 6.416710595677385e-07, "logits/chosen": 0.4103759825229645, "logits/rejected": 0.2829833924770355, "logps/chosen": -385.25, "logps/rejected": -411.25, "loss": 0.0798, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.7177978754043579, "rewards/margins": 5.028124809265137, "rewards/rejected": -4.309374809265137, "step": 2720 }, { "epoch": 1.4391143911439115, "grad_norm": 14.22712543410305, "learning_rate": 6.403531892461781e-07, "logits/chosen": 0.3114074766635895, "logits/rejected": 0.07757568359375, "logps/chosen": -394.29998779296875, "logps/rejected": -378.0, "loss": 0.0974, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5280395746231079, "rewards/margins": 5.025781154632568, "rewards/rejected": -4.500781059265137, "step": 2730 }, { "epoch": 1.4443858724301528, "grad_norm": 45.317966971840846, "learning_rate": 6.390353189246178e-07, "logits/chosen": 0.3033508360385895, "logits/rejected": 0.156280517578125, "logps/chosen": -435.45001220703125, "logps/rejected": -428.6000061035156, "loss": 0.0936, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.47368162870407104, "rewards/margins": 5.544531345367432, "rewards/rejected": -5.076562404632568, "step": 2740 }, { "epoch": 1.4496573537163944, "grad_norm": 14.742406704074476, "learning_rate": 6.377174486030574e-07, "logits/chosen": 0.08431396633386612, "logits/rejected": 0.06375732272863388, "logps/chosen": -388.70001220703125, "logps/rejected": -421.8999938964844, "loss": 0.0915, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06577148288488388, "rewards/margins": 5.209374904632568, "rewards/rejected": -5.142968654632568, "step": 2750 }, { "epoch": 1.4549288350026357, "grad_norm": 44.41642924537191, "learning_rate": 6.36399578281497e-07, "logits/chosen": 0.01577148400247097, "logits/rejected": -0.01479187048971653, "logps/chosen": -323.45001220703125, "logps/rejected": -347.54998779296875, "loss": 0.1291, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.11494140326976776, "rewards/margins": 4.517187595367432, "rewards/rejected": -4.401562690734863, "step": 2760 }, { "epoch": 1.4602003162888773, "grad_norm": 26.246919636716864, "learning_rate": 6.350817079599367e-07, "logits/chosen": 0.12202148139476776, "logits/rejected": 0.07639160007238388, "logps/chosen": -360.8500061035156, "logps/rejected": -412.79998779296875, "loss": 0.0655, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.27326661348342896, "rewards/margins": 5.404687404632568, "rewards/rejected": -5.134375095367432, "step": 2770 }, { "epoch": 1.4654717975751186, "grad_norm": 77.95756478914929, "learning_rate": 6.337638376383764e-07, "logits/chosen": 0.11948242038488388, "logits/rejected": 0.0955810546875, "logps/chosen": -367.3999938964844, "logps/rejected": -360.20001220703125, "loss": 0.1263, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.19144287705421448, "rewards/margins": 4.5, "rewards/rejected": -4.309374809265137, "step": 2780 }, { "epoch": 1.47074327886136, "grad_norm": 30.548293530893343, "learning_rate": 6.32445967316816e-07, "logits/chosen": 0.08482666313648224, "logits/rejected": 0.10136719048023224, "logps/chosen": -381.0, "logps/rejected": -394.54998779296875, "loss": 0.1003, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5385192632675171, "rewards/margins": 5.242968559265137, "rewards/rejected": -4.70703125, "step": 2790 }, { "epoch": 1.4760147601476015, "grad_norm": 66.45581461393738, "learning_rate": 6.311280969952556e-07, "logits/chosen": 0.17142334580421448, "logits/rejected": 0.16786804795265198, "logps/chosen": -337.20001220703125, "logps/rejected": -383.20001220703125, "loss": 0.101, "rewards/accuracies": 0.96875, "rewards/chosen": 0.588409423828125, "rewards/margins": 4.510937690734863, "rewards/rejected": -3.921875, "step": 2800 }, { "epoch": 1.481286241433843, "grad_norm": 39.741037266224055, "learning_rate": 6.298102266736953e-07, "logits/chosen": 0.27617186307907104, "logits/rejected": 0.16939087212085724, "logps/chosen": -371.8500061035156, "logps/rejected": -387.8999938964844, "loss": 0.0914, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.649707019329071, "rewards/margins": 4.530468940734863, "rewards/rejected": -3.87890625, "step": 2810 }, { "epoch": 1.4865577227200844, "grad_norm": 36.939452789726275, "learning_rate": 6.284923563521349e-07, "logits/chosen": 0.09327392280101776, "logits/rejected": 0.01839599572122097, "logps/chosen": -357.1000061035156, "logps/rejected": -349.1499938964844, "loss": 0.1322, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.4063720703125, "rewards/margins": 4.318749904632568, "rewards/rejected": -3.9117188453674316, "step": 2820 }, { "epoch": 1.4918292040063257, "grad_norm": 16.635218979855136, "learning_rate": 6.271744860305746e-07, "logits/chosen": 0.12531737983226776, "logits/rejected": 0.07185973972082138, "logps/chosen": -331.54998779296875, "logps/rejected": -326.75, "loss": 0.0823, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8817383050918579, "rewards/margins": 4.766406059265137, "rewards/rejected": -3.8851561546325684, "step": 2830 }, { "epoch": 1.4971006852925672, "grad_norm": 38.0761890033935, "learning_rate": 6.258566157090142e-07, "logits/chosen": 0.12603759765625, "logits/rejected": -0.014285278506577015, "logps/chosen": -350.3999938964844, "logps/rejected": -387.75, "loss": 0.0819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6032959222793579, "rewards/margins": 5.010937690734863, "rewards/rejected": -4.40625, "step": 2840 }, { "epoch": 1.5023721665788088, "grad_norm": 17.6279747597743, "learning_rate": 6.245387453874539e-07, "logits/chosen": 0.4213623106479645, "logits/rejected": 0.17294616997241974, "logps/chosen": -355.95001220703125, "logps/rejected": -379.1000061035156, "loss": 0.1212, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.921521008014679, "rewards/margins": 4.77734375, "rewards/rejected": -3.852343797683716, "step": 2850 }, { "epoch": 1.5076436478650501, "grad_norm": 46.905576836421055, "learning_rate": 6.232208750658935e-07, "logits/chosen": 0.26524657011032104, "logits/rejected": 0.11777343600988388, "logps/chosen": -397.3999938964844, "logps/rejected": -381.95001220703125, "loss": 0.1171, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.0894454717636108, "rewards/margins": 4.913281440734863, "rewards/rejected": -3.8265624046325684, "step": 2860 }, { "epoch": 1.5129151291512914, "grad_norm": 30.531453879705627, "learning_rate": 6.219030047443331e-07, "logits/chosen": 0.18902587890625, "logits/rejected": 0.00885620154440403, "logps/chosen": -366.5, "logps/rejected": -359.5, "loss": 0.0961, "rewards/accuracies": 0.96875, "rewards/chosen": 1.090600609779358, "rewards/margins": 4.608593940734863, "rewards/rejected": -3.5140624046325684, "step": 2870 }, { "epoch": 1.518186610437533, "grad_norm": 39.28009374380878, "learning_rate": 6.205851344227727e-07, "logits/chosen": 0.21842804551124573, "logits/rejected": 0.243804931640625, "logps/chosen": -357.54998779296875, "logps/rejected": -370.0, "loss": 0.0929, "rewards/accuracies": 0.96875, "rewards/chosen": 0.615551769733429, "rewards/margins": 4.628125190734863, "rewards/rejected": -4.015625, "step": 2880 }, { "epoch": 1.5234580917237743, "grad_norm": 49.492925090021835, "learning_rate": 6.192672641012125e-07, "logits/chosen": 0.2674804627895355, "logits/rejected": 0.17394104599952698, "logps/chosen": -381.1000061035156, "logps/rejected": -394.95001220703125, "loss": 0.1184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6504882574081421, "rewards/margins": 4.846093654632568, "rewards/rejected": -4.197656154632568, "step": 2890 }, { "epoch": 1.5287295730100157, "grad_norm": 16.060895117505524, "learning_rate": 6.179493937796521e-07, "logits/chosen": 0.16739502549171448, "logits/rejected": -0.025421142578125, "logps/chosen": -379.29998779296875, "logps/rejected": -377.25, "loss": 0.0967, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2780517637729645, "rewards/margins": 5.069531440734863, "rewards/rejected": -4.792187690734863, "step": 2900 }, { "epoch": 1.5340010542962572, "grad_norm": 22.460941014312855, "learning_rate": 6.166315234580917e-07, "logits/chosen": 0.17355652153491974, "logits/rejected": 0.05085449293255806, "logps/chosen": -349.6000061035156, "logps/rejected": -392.54998779296875, "loss": 0.0994, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.08826903998851776, "rewards/margins": 5.109375, "rewards/rejected": -5.025000095367432, "step": 2910 }, { "epoch": 1.5392725355824988, "grad_norm": 19.283316559775763, "learning_rate": 6.153136531365314e-07, "logits/chosen": 0.11807403713464737, "logits/rejected": 0.002227783203125, "logps/chosen": -364.0249938964844, "logps/rejected": -371.8500061035156, "loss": 0.1257, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3117431700229645, "rewards/margins": 4.754687309265137, "rewards/rejected": -4.44140625, "step": 2920 }, { "epoch": 1.54454401686874, "grad_norm": 11.670833226044149, "learning_rate": 6.13995782814971e-07, "logits/chosen": 0.06151122972369194, "logits/rejected": -0.17814025282859802, "logps/chosen": -382.3500061035156, "logps/rejected": -371.70001220703125, "loss": 0.0905, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.24077148735523224, "rewards/margins": 5.051562309265137, "rewards/rejected": -4.807812690734863, "step": 2930 }, { "epoch": 1.5498154981549814, "grad_norm": 37.629633999622484, "learning_rate": 6.126779124934106e-07, "logits/chosen": 0.15667724609375, "logits/rejected": -0.08171997219324112, "logps/chosen": -366.29998779296875, "logps/rejected": -390.6000061035156, "loss": 0.116, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.08803711086511612, "rewards/margins": 4.997656345367432, "rewards/rejected": -4.909375190734863, "step": 2940 }, { "epoch": 1.555086979441223, "grad_norm": 33.6625576290004, "learning_rate": 6.113600421718503e-07, "logits/chosen": 0.116058349609375, "logits/rejected": -0.03702392429113388, "logps/chosen": -377.54998779296875, "logps/rejected": -415.54998779296875, "loss": 0.0735, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2766662538051605, "rewards/margins": 5.58203125, "rewards/rejected": -5.303906440734863, "step": 2950 }, { "epoch": 1.5603584607274645, "grad_norm": 17.495006861705534, "learning_rate": 6.1004217185029e-07, "logits/chosen": 0.11605224758386612, "logits/rejected": -0.02458496019244194, "logps/chosen": -378.1000061035156, "logps/rejected": -370.29998779296875, "loss": 0.0905, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.40953367948532104, "rewards/margins": 4.939062595367432, "rewards/rejected": -4.52734375, "step": 2960 }, { "epoch": 1.5656299420137059, "grad_norm": 5.761801826850329, "learning_rate": 6.087243015287296e-07, "logits/chosen": 0.22664794325828552, "logits/rejected": 0.10625610500574112, "logps/chosen": -405.04998779296875, "logps/rejected": -385.79998779296875, "loss": 0.0878, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.076025366783142, "rewards/margins": 5.194531440734863, "rewards/rejected": -4.114062309265137, "step": 2970 }, { "epoch": 1.5709014232999472, "grad_norm": 35.378376529380915, "learning_rate": 6.074064312071692e-07, "logits/chosen": 0.195404052734375, "logits/rejected": -0.01312866248190403, "logps/chosen": -363.75, "logps/rejected": -374.04998779296875, "loss": 0.0922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7314453125, "rewards/margins": 4.90625, "rewards/rejected": -4.172656059265137, "step": 2980 }, { "epoch": 1.5761729045861887, "grad_norm": 44.459799607152796, "learning_rate": 6.060885608856087e-07, "logits/chosen": 0.20328369736671448, "logits/rejected": 0.06578369438648224, "logps/chosen": -399.3999938964844, "logps/rejected": -390.70001220703125, "loss": 0.1011, "rewards/accuracies": 0.96875, "rewards/chosen": 0.710742175579071, "rewards/margins": 5.205468654632568, "rewards/rejected": -4.495312690734863, "step": 2990 }, { "epoch": 1.5814443858724303, "grad_norm": 23.96691185806833, "learning_rate": 6.047706905640486e-07, "logits/chosen": 0.25786131620407104, "logits/rejected": 0.07260742038488388, "logps/chosen": -332.79998779296875, "logps/rejected": -345.0, "loss": 0.0956, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8481689691543579, "rewards/margins": 4.878125190734863, "rewards/rejected": -4.028124809265137, "step": 3000 }, { "epoch": 1.5867158671586716, "grad_norm": 49.66738920961612, "learning_rate": 6.034528202424881e-07, "logits/chosen": 0.30769044160842896, "logits/rejected": 0.15694275498390198, "logps/chosen": -339.75, "logps/rejected": -361.3500061035156, "loss": 0.1112, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.909423828125, "rewards/margins": 4.798437595367432, "rewards/rejected": -3.890625, "step": 3010 }, { "epoch": 1.591987348444913, "grad_norm": 40.74532074566243, "learning_rate": 6.021349499209277e-07, "logits/chosen": 0.19285888969898224, "logits/rejected": 0.09548339992761612, "logps/chosen": -361.0, "logps/rejected": -369.04998779296875, "loss": 0.0993, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9881347417831421, "rewards/margins": 4.911718845367432, "rewards/rejected": -3.9242186546325684, "step": 3020 }, { "epoch": 1.5972588297311545, "grad_norm": 110.80158128495022, "learning_rate": 6.008170795993674e-07, "logits/chosen": 0.16987304389476776, "logits/rejected": 0.136993408203125, "logps/chosen": -361.25, "logps/rejected": -385.70001220703125, "loss": 0.14, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7435058355331421, "rewards/margins": 4.686327934265137, "rewards/rejected": -3.9390625953674316, "step": 3030 }, { "epoch": 1.6025303110173958, "grad_norm": 46.522624042257846, "learning_rate": 5.99499209277807e-07, "logits/chosen": 0.23457641899585724, "logits/rejected": 0.18923339247703552, "logps/chosen": -332.45001220703125, "logps/rejected": -347.45001220703125, "loss": 0.1258, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2035644054412842, "rewards/margins": 4.499218940734863, "rewards/rejected": -3.3003907203674316, "step": 3040 }, { "epoch": 1.6078017923036372, "grad_norm": 17.00048467039329, "learning_rate": 5.981813389562466e-07, "logits/chosen": 0.45015257596969604, "logits/rejected": 0.20230713486671448, "logps/chosen": -350.25, "logps/rejected": -354.0, "loss": 0.0777, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.0775878429412842, "rewards/margins": 4.573437690734863, "rewards/rejected": -3.492968797683716, "step": 3050 }, { "epoch": 1.6130732735898787, "grad_norm": 8.570134623446144, "learning_rate": 5.968634686346863e-07, "logits/chosen": 0.26322633028030396, "logits/rejected": 0.12619629502296448, "logps/chosen": -348.6000061035156, "logps/rejected": -389.70001220703125, "loss": 0.0947, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.129614233970642, "rewards/margins": 4.463671684265137, "rewards/rejected": -3.333984375, "step": 3060 }, { "epoch": 1.6183447548761203, "grad_norm": 15.68497869553392, "learning_rate": 5.95545598313126e-07, "logits/chosen": 0.22232666611671448, "logits/rejected": 0.2041480988264084, "logps/chosen": -388.54998779296875, "logps/rejected": -390.5, "loss": 0.1162, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.9703124761581421, "rewards/margins": 4.839062690734863, "rewards/rejected": -3.8687500953674316, "step": 3070 }, { "epoch": 1.6236162361623616, "grad_norm": 25.55445621774485, "learning_rate": 5.942277279915656e-07, "logits/chosen": 0.35364991426467896, "logits/rejected": 0.1348876953125, "logps/chosen": -369.1000061035156, "logps/rejected": -359.54998779296875, "loss": 0.0766, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6556640863418579, "rewards/margins": 4.810156345367432, "rewards/rejected": -4.157812595367432, "step": 3080 }, { "epoch": 1.628887717448603, "grad_norm": 37.61367849116987, "learning_rate": 5.929098576700052e-07, "logits/chosen": 0.06911011040210724, "logits/rejected": 0.02445678785443306, "logps/chosen": -367.75, "logps/rejected": -333.45001220703125, "loss": 0.0979, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.89508056640625, "rewards/margins": 4.665625095367432, "rewards/rejected": -3.772656202316284, "step": 3090 }, { "epoch": 1.6341591987348445, "grad_norm": 13.313441835671053, "learning_rate": 5.915919873484448e-07, "logits/chosen": 0.17125244438648224, "logits/rejected": 0.15015259385108948, "logps/chosen": -377.3500061035156, "logps/rejected": -405.70001220703125, "loss": 0.0684, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.820605456829071, "rewards/margins": 5.342187404632568, "rewards/rejected": -4.517968654632568, "step": 3100 }, { "epoch": 1.639430680021086, "grad_norm": 26.88807515172817, "learning_rate": 5.902741170268845e-07, "logits/chosen": 0.13760986924171448, "logits/rejected": 0.06700439751148224, "logps/chosen": -408.8999938964844, "logps/rejected": -406.3500061035156, "loss": 0.0709, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6676025390625, "rewards/margins": 5.228906154632568, "rewards/rejected": -4.560937404632568, "step": 3110 }, { "epoch": 1.6447021613073274, "grad_norm": 60.92612482320285, "learning_rate": 5.889562467053242e-07, "logits/chosen": 0.11678466945886612, "logits/rejected": -0.02178955078125, "logps/chosen": -325.125, "logps/rejected": -383.6499938964844, "loss": 0.097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.45989990234375, "rewards/margins": 4.995312690734863, "rewards/rejected": -4.536718845367432, "step": 3120 }, { "epoch": 1.6499736425935687, "grad_norm": 33.31532878925389, "learning_rate": 5.876383763837638e-07, "logits/chosen": 0.20257262885570526, "logits/rejected": 0.02620849572122097, "logps/chosen": -336.54998779296875, "logps/rejected": -379.5, "loss": 0.1086, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2762084901332855, "rewards/margins": 4.932031154632568, "rewards/rejected": -4.651562690734863, "step": 3130 }, { "epoch": 1.6552451238798103, "grad_norm": 76.49383449671309, "learning_rate": 5.863205060622034e-07, "logits/chosen": 0.1464080810546875, "logits/rejected": 0.01505126990377903, "logps/chosen": -410.70001220703125, "logps/rejected": -393.3500061035156, "loss": 0.1327, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.8517395257949829, "rewards/margins": 4.786718845367432, "rewards/rejected": -3.9359374046325684, "step": 3140 }, { "epoch": 1.6605166051660518, "grad_norm": 12.767076592625617, "learning_rate": 5.850026357406431e-07, "logits/chosen": 0.32554322481155396, "logits/rejected": 0.16090698540210724, "logps/chosen": -354.20001220703125, "logps/rejected": -376.45001220703125, "loss": 0.1014, "rewards/accuracies": 0.96875, "rewards/chosen": 1.129919409751892, "rewards/margins": 4.504687309265137, "rewards/rejected": -3.373828172683716, "step": 3150 }, { "epoch": 1.6657880864522931, "grad_norm": 22.02721625449822, "learning_rate": 5.836847654190827e-07, "logits/chosen": 0.31129151582717896, "logits/rejected": 0.17782440781593323, "logps/chosen": -392.0, "logps/rejected": -399.29998779296875, "loss": 0.0981, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9996093511581421, "rewards/margins": 4.942187309265137, "rewards/rejected": -3.944531202316284, "step": 3160 }, { "epoch": 1.6710595677385345, "grad_norm": 48.34651411106925, "learning_rate": 5.823668950975223e-07, "logits/chosen": 0.224212646484375, "logits/rejected": 0.121978759765625, "logps/chosen": -358.6499938964844, "logps/rejected": -402.54998779296875, "loss": 0.0812, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8047424554824829, "rewards/margins": 4.839062690734863, "rewards/rejected": -4.032031059265137, "step": 3170 }, { "epoch": 1.676331049024776, "grad_norm": 16.351171755877225, "learning_rate": 5.810490247759621e-07, "logits/chosen": 0.16498413681983948, "logits/rejected": 0.06964264065027237, "logps/chosen": -402.5, "logps/rejected": -421.5, "loss": 0.0808, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.909375011920929, "rewards/margins": 5.408593654632568, "rewards/rejected": -4.5, "step": 3180 }, { "epoch": 1.6816025303110174, "grad_norm": 17.476081872097694, "learning_rate": 5.797311544544017e-07, "logits/chosen": 0.255340576171875, "logits/rejected": 0.06097412109375, "logps/chosen": -362.20001220703125, "logps/rejected": -359.1000061035156, "loss": 0.1024, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.769091784954071, "rewards/margins": 4.978906154632568, "rewards/rejected": -4.210156440734863, "step": 3190 }, { "epoch": 1.6868740115972587, "grad_norm": 37.9703841069287, "learning_rate": 5.784132841328413e-07, "logits/chosen": 0.07777099311351776, "logits/rejected": -0.02768249437212944, "logps/chosen": -403.0, "logps/rejected": -399.29998779296875, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.692919909954071, "rewards/margins": 5.576562404632568, "rewards/rejected": -4.881249904632568, "step": 3200 }, { "epoch": 1.6921454928835002, "grad_norm": 44.9830480700517, "learning_rate": 5.770954138112809e-07, "logits/chosen": 0.16682739555835724, "logits/rejected": 0.04175720363855362, "logps/chosen": -342.20001220703125, "logps/rejected": -319.8500061035156, "loss": 0.1063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.27489012479782104, "rewards/margins": 4.397656440734863, "rewards/rejected": -4.125, "step": 3210 }, { "epoch": 1.6974169741697418, "grad_norm": 45.32513832470945, "learning_rate": 5.757775434897206e-07, "logits/chosen": 0.23352661728858948, "logits/rejected": 0.18868407607078552, "logps/chosen": -372.1000061035156, "logps/rejected": -383.7749938964844, "loss": 0.134, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.571441650390625, "rewards/margins": 4.883593559265137, "rewards/rejected": -4.30859375, "step": 3220 }, { "epoch": 1.7026884554559831, "grad_norm": 8.133339352627504, "learning_rate": 5.744596731681603e-07, "logits/chosen": 0.2361907958984375, "logits/rejected": 0.081268310546875, "logps/chosen": -363.3999938964844, "logps/rejected": -367.54998779296875, "loss": 0.0766, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.081445336341858, "rewards/margins": 5.035937309265137, "rewards/rejected": -3.952343702316284, "step": 3230 }, { "epoch": 1.7079599367422245, "grad_norm": 42.22238655740257, "learning_rate": 5.731418028465999e-07, "logits/chosen": 0.26166993379592896, "logits/rejected": 0.160247802734375, "logps/chosen": -334.8500061035156, "logps/rejected": -347.5, "loss": 0.1014, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.73876953125, "rewards/margins": 5.11328125, "rewards/rejected": -4.367968559265137, "step": 3240 }, { "epoch": 1.713231418028466, "grad_norm": 22.920032985362653, "learning_rate": 5.718239325250395e-07, "logits/chosen": 0.40452879667282104, "logits/rejected": 0.06968383491039276, "logps/chosen": -426.3500061035156, "logps/rejected": -397.5, "loss": 0.0975, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6191650629043579, "rewards/margins": 5.314843654632568, "rewards/rejected": -4.693749904632568, "step": 3250 }, { "epoch": 1.7185028993147076, "grad_norm": 42.78119754637318, "learning_rate": 5.705060622034792e-07, "logits/chosen": 0.0841064453125, "logits/rejected": 0.07111511379480362, "logps/chosen": -354.70001220703125, "logps/rejected": -381.75, "loss": 0.0879, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.467721551656723, "rewards/margins": 5.237500190734863, "rewards/rejected": -4.7734375, "step": 3260 }, { "epoch": 1.723774380600949, "grad_norm": 290.7617678495558, "learning_rate": 5.691881918819188e-07, "logits/chosen": 0.33647459745407104, "logits/rejected": -0.12761230766773224, "logps/chosen": -378.70001220703125, "logps/rejected": -344.70001220703125, "loss": 0.1833, "rewards/accuracies": 0.9375, "rewards/chosen": 0.262786865234375, "rewards/margins": 4.77734375, "rewards/rejected": -4.513281345367432, "step": 3270 }, { "epoch": 1.7290458618871902, "grad_norm": 19.174701872790195, "learning_rate": 5.678703215603584e-07, "logits/chosen": 0.12607422471046448, "logits/rejected": 0.04972534254193306, "logps/chosen": -394.3999938964844, "logps/rejected": -390.1499938964844, "loss": 0.0872, "rewards/accuracies": 0.96875, "rewards/chosen": 0.58392333984375, "rewards/margins": 5.186718940734863, "rewards/rejected": -4.59765625, "step": 3280 }, { "epoch": 1.7343173431734318, "grad_norm": 101.6716537904052, "learning_rate": 5.665524512387981e-07, "logits/chosen": 0.20409850776195526, "logits/rejected": 0.09432373195886612, "logps/chosen": -382.95001220703125, "logps/rejected": -400.1499938964844, "loss": 0.1061, "rewards/accuracies": 0.96875, "rewards/chosen": 0.556933581829071, "rewards/margins": 5.055468559265137, "rewards/rejected": -4.492968559265137, "step": 3290 }, { "epoch": 1.7395888244596733, "grad_norm": 13.725071307047706, "learning_rate": 5.652345809172378e-07, "logits/chosen": 0.3476318418979645, "logits/rejected": 0.16340942680835724, "logps/chosen": -351.70001220703125, "logps/rejected": -364.6000061035156, "loss": 0.1133, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8467041254043579, "rewards/margins": 4.784375190734863, "rewards/rejected": -3.938281297683716, "step": 3300 }, { "epoch": 1.7448603057459144, "grad_norm": 10.321489268492147, "learning_rate": 5.639167105956774e-07, "logits/chosen": 0.2681884765625, "logits/rejected": 0.10307617485523224, "logps/chosen": -344.07501220703125, "logps/rejected": -346.3500061035156, "loss": 0.0921, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8324829339981079, "rewards/margins": 4.642187595367432, "rewards/rejected": -3.8101563453674316, "step": 3310 }, { "epoch": 1.750131787032156, "grad_norm": 28.209523927333105, "learning_rate": 5.62598840274117e-07, "logits/chosen": 0.3971313536167145, "logits/rejected": 0.07885131984949112, "logps/chosen": -382.45001220703125, "logps/rejected": -404.8999938964844, "loss": 0.067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.2401244640350342, "rewards/margins": 5.4140625, "rewards/rejected": -4.17578125, "step": 3320 }, { "epoch": 1.7554032683183975, "grad_norm": 25.768474215714544, "learning_rate": 5.612809699525567e-07, "logits/chosen": 0.20740661025047302, "logits/rejected": 0.04649658128619194, "logps/chosen": -348.25, "logps/rejected": -368.04998779296875, "loss": 0.1041, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.836108386516571, "rewards/margins": 5.114062309265137, "rewards/rejected": -4.279687404632568, "step": 3330 }, { "epoch": 1.7606747496046389, "grad_norm": 27.575915720316814, "learning_rate": 5.599630996309963e-07, "logits/chosen": 0.2828125059604645, "logits/rejected": 0.17938232421875, "logps/chosen": -417.42498779296875, "logps/rejected": -380.75, "loss": 0.1529, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.009374976158142, "rewards/margins": 5.165625095367432, "rewards/rejected": -4.157812595367432, "step": 3340 }, { "epoch": 1.7659462308908802, "grad_norm": 18.76310394948931, "learning_rate": 5.58645229309436e-07, "logits/chosen": 0.21014098823070526, "logits/rejected": 0.14265136420726776, "logps/chosen": -371.29998779296875, "logps/rejected": -378.70001220703125, "loss": 0.0811, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.940478503704071, "rewards/margins": 5.076562404632568, "rewards/rejected": -4.129687309265137, "step": 3350 }, { "epoch": 1.7712177121771218, "grad_norm": 31.69273485214709, "learning_rate": 5.573273589878755e-07, "logits/chosen": 0.07187499850988388, "logits/rejected": 0.004345702938735485, "logps/chosen": -339.1000061035156, "logps/rejected": -385.0, "loss": 0.0784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4255737364292145, "rewards/margins": 5.232812404632568, "rewards/rejected": -4.807031154632568, "step": 3360 }, { "epoch": 1.7764891934633633, "grad_norm": 12.205178662581792, "learning_rate": 5.560094886663152e-07, "logits/chosen": 0.42705076932907104, "logits/rejected": 0.15734252333641052, "logps/chosen": -385.6499938964844, "logps/rejected": -385.3999938964844, "loss": 0.0811, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.565502941608429, "rewards/margins": 5.334374904632568, "rewards/rejected": -4.764843940734863, "step": 3370 }, { "epoch": 1.7817606747496046, "grad_norm": 27.031481324156832, "learning_rate": 5.546916183447548e-07, "logits/chosen": 0.27031248807907104, "logits/rejected": 0.0419921875, "logps/chosen": -414.54998779296875, "logps/rejected": -423.5, "loss": 0.0913, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.608081042766571, "rewards/margins": 5.57421875, "rewards/rejected": -4.969531059265137, "step": 3380 }, { "epoch": 1.787032156035846, "grad_norm": 18.48456072144214, "learning_rate": 5.533737480231944e-07, "logits/chosen": 0.08225402981042862, "logits/rejected": 0.005706787109375, "logps/chosen": -373.5, "logps/rejected": -415.8999938964844, "loss": 0.0644, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3423828184604645, "rewards/margins": 5.109375, "rewards/rejected": -4.766406059265137, "step": 3390 }, { "epoch": 1.7923036373220875, "grad_norm": 60.78362114545018, "learning_rate": 5.520558777016341e-07, "logits/chosen": 0.10427246242761612, "logits/rejected": 0.0592041015625, "logps/chosen": -375.5, "logps/rejected": -408.8500061035156, "loss": 0.0865, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.543548583984375, "rewards/margins": 5.953125, "rewards/rejected": -5.412499904632568, "step": 3400 }, { "epoch": 1.797575118608329, "grad_norm": 13.924594957678021, "learning_rate": 5.507380073800738e-07, "logits/chosen": 0.1739501953125, "logits/rejected": 0.03355712816119194, "logps/chosen": -331.5249938964844, "logps/rejected": -388.1000061035156, "loss": 0.1061, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17618103325366974, "rewards/margins": 5.067187309265137, "rewards/rejected": -4.89453125, "step": 3410 }, { "epoch": 1.8028465998945704, "grad_norm": 37.27812311091622, "learning_rate": 5.494201370585134e-07, "logits/chosen": 0.10746459662914276, "logits/rejected": -0.005841064266860485, "logps/chosen": -364.6499938964844, "logps/rejected": -411.6499938964844, "loss": 0.0736, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.13349609076976776, "rewards/margins": 5.44140625, "rewards/rejected": -5.3125, "step": 3420 }, { "epoch": 1.8081180811808117, "grad_norm": 20.5794750043546, "learning_rate": 5.48102266736953e-07, "logits/chosen": 0.15091553330421448, "logits/rejected": -0.0804443359375, "logps/chosen": -374.6499938964844, "logps/rejected": -389.54998779296875, "loss": 0.0903, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.167022705078125, "rewards/margins": 4.958593845367432, "rewards/rejected": -4.7890625, "step": 3430 }, { "epoch": 1.8133895624670533, "grad_norm": 30.073511427371837, "learning_rate": 5.467843964153926e-07, "logits/chosen": 0.12016601860523224, "logits/rejected": -0.08825989067554474, "logps/chosen": -382.70001220703125, "logps/rejected": -365.25, "loss": 0.1248, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.48443603515625, "rewards/margins": 5.107812404632568, "rewards/rejected": -4.626562595367432, "step": 3440 }, { "epoch": 1.8186610437532946, "grad_norm": 36.250791080583795, "learning_rate": 5.454665260938323e-07, "logits/chosen": 0.10046996921300888, "logits/rejected": 0.07685546576976776, "logps/chosen": -316.25, "logps/rejected": -324.04998779296875, "loss": 0.1445, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.07233276218175888, "rewards/margins": 4.103906154632568, "rewards/rejected": -4.172656059265137, "step": 3450 }, { "epoch": 1.823932525039536, "grad_norm": 63.459348329672444, "learning_rate": 5.44148655772272e-07, "logits/chosen": 0.25900572538375854, "logits/rejected": 0.17260131239891052, "logps/chosen": -367.5, "logps/rejected": -399.20001220703125, "loss": 0.0991, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7074218988418579, "rewards/margins": 5.184374809265137, "rewards/rejected": -4.479296684265137, "step": 3460 }, { "epoch": 1.8292040063257775, "grad_norm": 45.740648502142946, "learning_rate": 5.428307854507116e-07, "logits/chosen": 0.21249084174633026, "logits/rejected": 0.10277099907398224, "logps/chosen": -323.20001220703125, "logps/rejected": -368.70001220703125, "loss": 0.1155, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5352783203125, "rewards/margins": 4.750781059265137, "rewards/rejected": -4.213281154632568, "step": 3470 }, { "epoch": 1.834475487612019, "grad_norm": 63.70181335335642, "learning_rate": 5.415129151291513e-07, "logits/chosen": 0.19239501655101776, "logits/rejected": 0.09752807766199112, "logps/chosen": -376.3999938964844, "logps/rejected": -407.20001220703125, "loss": 0.1066, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.6451171636581421, "rewards/margins": 5.103906154632568, "rewards/rejected": -4.455468654632568, "step": 3480 }, { "epoch": 1.8397469688982604, "grad_norm": 31.440725900001567, "learning_rate": 5.401950448075909e-07, "logits/chosen": 0.22153320908546448, "logits/rejected": 0.24217529594898224, "logps/chosen": -331.3500061035156, "logps/rejected": -365.1000061035156, "loss": 0.0815, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.9798339605331421, "rewards/margins": 5.060156345367432, "rewards/rejected": -4.0859375, "step": 3490 }, { "epoch": 1.8450184501845017, "grad_norm": 42.4873158965656, "learning_rate": 5.388771744860305e-07, "logits/chosen": 0.17583008110523224, "logits/rejected": 0.01055297814309597, "logps/chosen": -301.29998779296875, "logps/rejected": -325.7250061035156, "loss": 0.1358, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.7525879144668579, "rewards/margins": 4.283593654632568, "rewards/rejected": -3.5269532203674316, "step": 3500 }, { "epoch": 1.8502899314707433, "grad_norm": 58.612597425646534, "learning_rate": 5.375593041644701e-07, "logits/chosen": 0.24239501357078552, "logits/rejected": 0.17076416313648224, "logps/chosen": -386.1499938964844, "logps/rejected": -388.1499938964844, "loss": 0.1017, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7552245855331421, "rewards/margins": 5.227343559265137, "rewards/rejected": -4.47265625, "step": 3510 }, { "epoch": 1.8555614127569848, "grad_norm": 50.34074058001582, "learning_rate": 5.362414338429099e-07, "logits/chosen": 0.29060059785842896, "logits/rejected": 0.1016845703125, "logps/chosen": -396.6000061035156, "logps/rejected": -400.6499938964844, "loss": 0.1013, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.906542956829071, "rewards/margins": 5.224218845367432, "rewards/rejected": -4.317187309265137, "step": 3520 }, { "epoch": 1.8608328940432262, "grad_norm": 19.346764055970436, "learning_rate": 5.349235635213495e-07, "logits/chosen": 0.16901855170726776, "logits/rejected": 0.14353027939796448, "logps/chosen": -401.5, "logps/rejected": -404.79998779296875, "loss": 0.0657, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.1896483898162842, "rewards/margins": 5.301562309265137, "rewards/rejected": -4.111718654632568, "step": 3530 }, { "epoch": 1.8661043753294675, "grad_norm": 45.83251125973144, "learning_rate": 5.336056931997891e-07, "logits/chosen": 0.19172362983226776, "logits/rejected": 0.06977538764476776, "logps/chosen": -378.54998779296875, "logps/rejected": -443.3999938964844, "loss": 0.0994, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5469604730606079, "rewards/margins": 5.819531440734863, "rewards/rejected": -5.268750190734863, "step": 3540 }, { "epoch": 1.871375856615709, "grad_norm": 22.556982599362506, "learning_rate": 5.322878228782287e-07, "logits/chosen": 0.055206298828125, "logits/rejected": -0.012799072079360485, "logps/chosen": -405.1000061035156, "logps/rejected": -392.1499938964844, "loss": 0.0593, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.546826183795929, "rewards/margins": 5.536718845367432, "rewards/rejected": -4.987500190734863, "step": 3550 }, { "epoch": 1.8766473379019506, "grad_norm": 30.77746725346084, "learning_rate": 5.309699525566684e-07, "logits/chosen": 0.13547363877296448, "logits/rejected": 0.03846435621380806, "logps/chosen": -345.3999938964844, "logps/rejected": -391.29998779296875, "loss": 0.0883, "rewards/accuracies": 0.96875, "rewards/chosen": 0.33140867948532104, "rewards/margins": 5.332812309265137, "rewards/rejected": -5.000781059265137, "step": 3560 }, { "epoch": 1.881918819188192, "grad_norm": 26.45306424057017, "learning_rate": 5.296520822351081e-07, "logits/chosen": 0.09602050483226776, "logits/rejected": -0.11704101413488388, "logps/chosen": -369.0, "logps/rejected": -360.95001220703125, "loss": 0.0832, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.15360108017921448, "rewards/margins": 4.807812690734863, "rewards/rejected": -4.653124809265137, "step": 3570 }, { "epoch": 1.8871903004744333, "grad_norm": 47.20213467387968, "learning_rate": 5.283342119135477e-07, "logits/chosen": 0.04294433444738388, "logits/rejected": -0.02683715894818306, "logps/chosen": -345.1499938964844, "logps/rejected": -369.29998779296875, "loss": 0.0983, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.08701171725988388, "rewards/margins": 4.948437690734863, "rewards/rejected": -4.859375, "step": 3580 }, { "epoch": 1.8924617817606748, "grad_norm": 26.616601161369754, "learning_rate": 5.270163415919874e-07, "logits/chosen": 0.10646972805261612, "logits/rejected": 0.03239746019244194, "logps/chosen": -332.95001220703125, "logps/rejected": -363.45001220703125, "loss": 0.0913, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05788574367761612, "rewards/margins": 4.926562309265137, "rewards/rejected": -4.868750095367432, "step": 3590 }, { "epoch": 1.8977332630469161, "grad_norm": 61.83188147694914, "learning_rate": 5.25698471270427e-07, "logits/chosen": -0.04537353664636612, "logits/rejected": -0.24464721977710724, "logps/chosen": -352.45001220703125, "logps/rejected": -358.8500061035156, "loss": 0.1636, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07662353664636612, "rewards/margins": 4.835156440734863, "rewards/rejected": -4.911718845367432, "step": 3600 }, { "epoch": 1.9030047443331575, "grad_norm": 12.772663213895974, "learning_rate": 5.243806009488666e-07, "logits/chosen": 0.11697997897863388, "logits/rejected": 0.0821533203125, "logps/chosen": -406.3999938964844, "logps/rejected": -421.5, "loss": 0.0909, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5210815668106079, "rewards/margins": 5.248437404632568, "rewards/rejected": -4.72265625, "step": 3610 }, { "epoch": 1.908276225619399, "grad_norm": 18.745989750239687, "learning_rate": 5.230627306273062e-07, "logits/chosen": 0.19408416748046875, "logits/rejected": -0.02109222486615181, "logps/chosen": -373.5, "logps/rejected": -370.3999938964844, "loss": 0.1151, "rewards/accuracies": 0.96875, "rewards/chosen": 0.815234363079071, "rewards/margins": 5.200781345367432, "rewards/rejected": -4.384375095367432, "step": 3620 }, { "epoch": 1.9135477069056406, "grad_norm": 30.3702190969647, "learning_rate": 5.21744860305746e-07, "logits/chosen": 0.2823242247104645, "logits/rejected": 0.13222655653953552, "logps/chosen": -402.79998779296875, "logps/rejected": -396.8999938964844, "loss": 0.0652, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.729687511920929, "rewards/margins": 4.869531154632568, "rewards/rejected": -4.142968654632568, "step": 3630 }, { "epoch": 1.918819188191882, "grad_norm": 18.574716668452258, "learning_rate": 5.204269899841856e-07, "logits/chosen": 0.3530517518520355, "logits/rejected": -0.02797241136431694, "logps/chosen": -392.07501220703125, "logps/rejected": -391.1000061035156, "loss": 0.1053, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.24371948838233948, "rewards/margins": 5.4375, "rewards/rejected": -5.193749904632568, "step": 3640 }, { "epoch": 1.9240906694781232, "grad_norm": 63.482813875715, "learning_rate": 5.191091196626252e-07, "logits/chosen": 0.12907715141773224, "logits/rejected": 0.013659668155014515, "logps/chosen": -371.70001220703125, "logps/rejected": -400.3500061035156, "loss": 0.1093, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.01746826246380806, "rewards/margins": 5.026562690734863, "rewards/rejected": -5.007031440734863, "step": 3650 }, { "epoch": 1.9293621507643648, "grad_norm": 9.258952400682011, "learning_rate": 5.177912493410648e-07, "logits/chosen": 0.14215698838233948, "logits/rejected": 0.05540161207318306, "logps/chosen": -365.3500061035156, "logps/rejected": -414.79998779296875, "loss": 0.0803, "rewards/accuracies": 0.96875, "rewards/chosen": 0.40791016817092896, "rewards/margins": 5.331250190734863, "rewards/rejected": -4.92578125, "step": 3660 }, { "epoch": 1.9346336320506063, "grad_norm": 39.25542177970944, "learning_rate": 5.164733790195045e-07, "logits/chosen": 0.18989257514476776, "logits/rejected": -0.03632201999425888, "logps/chosen": -399.8500061035156, "logps/rejected": -392.0, "loss": 0.0932, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22810058295726776, "rewards/margins": 5.341406345367432, "rewards/rejected": -5.111718654632568, "step": 3670 }, { "epoch": 1.9399051133368477, "grad_norm": 34.69976382649998, "learning_rate": 5.151555086979441e-07, "logits/chosen": 0.13365478813648224, "logits/rejected": -0.014599609188735485, "logps/chosen": -348.5, "logps/rejected": -386.45001220703125, "loss": 0.1189, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.08167724311351776, "rewards/margins": 4.903906345367432, "rewards/rejected": -4.821875095367432, "step": 3680 }, { "epoch": 1.945176594623089, "grad_norm": 19.662705370553137, "learning_rate": 5.138376383763838e-07, "logits/chosen": -0.03482971340417862, "logits/rejected": -0.03809203952550888, "logps/chosen": -371.8999938964844, "logps/rejected": -378.79998779296875, "loss": 0.078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1585693359375, "rewards/margins": 4.982031345367432, "rewards/rejected": -4.817968845367432, "step": 3690 }, { "epoch": 1.9504480759093306, "grad_norm": 35.15482003372672, "learning_rate": 5.125197680548234e-07, "logits/chosen": 0.1278076171875, "logits/rejected": 0.011318969540297985, "logps/chosen": -407.6000061035156, "logps/rejected": -402.3999938964844, "loss": 0.1118, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.10316161811351776, "rewards/margins": 5.253125190734863, "rewards/rejected": -5.154687404632568, "step": 3700 }, { "epoch": 1.9557195571955721, "grad_norm": 15.145510951392891, "learning_rate": 5.11201897733263e-07, "logits/chosen": 0.02176818810403347, "logits/rejected": -0.08028869330883026, "logps/chosen": -370.95001220703125, "logps/rejected": -425.70001220703125, "loss": 0.067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.29371339082717896, "rewards/margins": 5.305468559265137, "rewards/rejected": -5.004687309265137, "step": 3710 }, { "epoch": 1.9609910384818134, "grad_norm": 47.40646215612196, "learning_rate": 5.098840274117026e-07, "logits/chosen": 0.16463622450828552, "logits/rejected": 0.074432373046875, "logps/chosen": -377.70001220703125, "logps/rejected": -379.3500061035156, "loss": 0.0998, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16524657607078552, "rewards/margins": 4.870312690734863, "rewards/rejected": -4.707812309265137, "step": 3720 }, { "epoch": 1.9662625197680548, "grad_norm": 30.09058612218654, "learning_rate": 5.085661570901422e-07, "logits/chosen": 0.014294433407485485, "logits/rejected": 0.02934570237994194, "logps/chosen": -343.95001220703125, "logps/rejected": -386.75, "loss": 0.0881, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.381002813577652, "rewards/margins": 4.90234375, "rewards/rejected": -4.519921779632568, "step": 3730 }, { "epoch": 1.9715340010542963, "grad_norm": 42.17957399807411, "learning_rate": 5.07248286768582e-07, "logits/chosen": 0.10686340183019638, "logits/rejected": 0.08256836235523224, "logps/chosen": -379.95001220703125, "logps/rejected": -402.79998779296875, "loss": 0.1035, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.31120604276657104, "rewards/margins": 5.296093940734863, "rewards/rejected": -4.98046875, "step": 3740 }, { "epoch": 1.9768054823405377, "grad_norm": 34.3125982245806, "learning_rate": 5.059304164470216e-07, "logits/chosen": 0.09299316257238388, "logits/rejected": -0.07228393852710724, "logps/chosen": -338.0, "logps/rejected": -350.3500061035156, "loss": 0.1042, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4452148377895355, "rewards/margins": 4.682812690734863, "rewards/rejected": -4.235937595367432, "step": 3750 }, { "epoch": 1.982076963626779, "grad_norm": 55.100841291399774, "learning_rate": 5.046125461254612e-07, "logits/chosen": 0.11461181938648224, "logits/rejected": -0.03413086012005806, "logps/chosen": -371.3999938964844, "logps/rejected": -351.45001220703125, "loss": 0.0666, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4450439512729645, "rewards/margins": 5.09375, "rewards/rejected": -4.649218559265137, "step": 3760 }, { "epoch": 1.9873484449130205, "grad_norm": 22.82802259178846, "learning_rate": 5.032946758039008e-07, "logits/chosen": 0.13947907090187073, "logits/rejected": 0.03284912183880806, "logps/chosen": -367.20001220703125, "logps/rejected": -427.1499938964844, "loss": 0.0592, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4097839295864105, "rewards/margins": 5.076562404632568, "rewards/rejected": -4.666406154632568, "step": 3770 }, { "epoch": 1.992619926199262, "grad_norm": 28.62462525530095, "learning_rate": 5.019768054823405e-07, "logits/chosen": 0.026092529296875, "logits/rejected": -0.03395996242761612, "logps/chosen": -361.8999938964844, "logps/rejected": -411.1000061035156, "loss": 0.105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3552612364292145, "rewards/margins": 5.246874809265137, "rewards/rejected": -5.6015625, "step": 3780 }, { "epoch": 1.9978914074855034, "grad_norm": 18.046787721218493, "learning_rate": 5.006589351607801e-07, "logits/chosen": -0.13710784912109375, "logits/rejected": -0.15821532905101776, "logps/chosen": -358.54998779296875, "logps/rejected": -390.6000061035156, "loss": 0.0833, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.272042840719223, "rewards/margins": 5.4765625, "rewards/rejected": -5.747656345367432, "step": 3790 }, { "epoch": 2.0031628887717448, "grad_norm": 1.6113514440282348, "learning_rate": 4.993410648392198e-07, "logits/chosen": 0.12132110446691513, "logits/rejected": -0.17075195908546448, "logps/chosen": -350.1000061035156, "logps/rejected": -371.95001220703125, "loss": 0.0692, "rewards/accuracies": 0.9729167222976685, "rewards/chosen": -0.12206725776195526, "rewards/margins": 6.10546875, "rewards/rejected": -6.2265625, "step": 3800 }, { "epoch": 2.0084343700579863, "grad_norm": 5.819624749199907, "learning_rate": 4.980231945176594e-07, "logits/chosen": 0.0133056640625, "logits/rejected": -0.0723876953125, "logps/chosen": -368.6000061035156, "logps/rejected": -389.0, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.283935546875, "rewards/margins": 6.456250190734863, "rewards/rejected": -6.175000190734863, "step": 3810 }, { "epoch": 2.013705851344228, "grad_norm": 16.988611133673338, "learning_rate": 4.96705324196099e-07, "logits/chosen": -0.06849364936351776, "logits/rejected": -0.4080566465854645, "logps/chosen": -392.20001220703125, "logps/rejected": -404.29998779296875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 0.3604736328125, "rewards/margins": 7.004687309265137, "rewards/rejected": -6.637499809265137, "step": 3820 }, { "epoch": 2.018977332630469, "grad_norm": 2.475695070469691, "learning_rate": 4.953874538745387e-07, "logits/chosen": -0.10117187350988388, "logits/rejected": -0.29173582792282104, "logps/chosen": -375.1499938964844, "logps/rejected": -422.0, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.34931641817092896, "rewards/margins": 7.721875190734863, "rewards/rejected": -8.073437690734863, "step": 3830 }, { "epoch": 2.0242488139167105, "grad_norm": 4.504493439139634, "learning_rate": 4.940695835529783e-07, "logits/chosen": 0.02695312537252903, "logits/rejected": -0.1380660980939865, "logps/chosen": -346.1000061035156, "logps/rejected": -377.04998779296875, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 0.02084960974752903, "rewards/margins": 6.459374904632568, "rewards/rejected": -6.440625190734863, "step": 3840 }, { "epoch": 2.029520295202952, "grad_norm": 3.390621406752865, "learning_rate": 4.92751713231418e-07, "logits/chosen": 0.0681915283203125, "logits/rejected": -0.255776971578598, "logps/chosen": -365.79998779296875, "logps/rejected": -384.6000061035156, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 0.1531982421875, "rewards/margins": 6.59375, "rewards/rejected": -6.442187309265137, "step": 3850 }, { "epoch": 2.0347917764891936, "grad_norm": 11.82793155600285, "learning_rate": 4.914338429098576e-07, "logits/chosen": -0.04677734524011612, "logits/rejected": -0.14215698838233948, "logps/chosen": -385.20001220703125, "logps/rejected": -463.0, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.30732423067092896, "rewards/margins": 7.1640625, "rewards/rejected": -6.856249809265137, "step": 3860 }, { "epoch": 2.0400632577754347, "grad_norm": 7.488088878983383, "learning_rate": 4.901159725882973e-07, "logits/chosen": -0.17464599013328552, "logits/rejected": -0.35338133573532104, "logps/chosen": -355.0, "logps/rejected": -368.1000061035156, "loss": 0.0175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.06944580376148224, "rewards/margins": 6.901562690734863, "rewards/rejected": -6.828125, "step": 3870 }, { "epoch": 2.0453347390616763, "grad_norm": 1.0932251225225593, "learning_rate": 4.88798102266737e-07, "logits/chosen": -0.20891113579273224, "logits/rejected": -0.3120361268520355, "logps/chosen": -366.8999938964844, "logps/rejected": -397.1000061035156, "loss": 0.0204, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.588330090045929, "rewards/margins": 6.96875, "rewards/rejected": -7.557812690734863, "step": 3880 }, { "epoch": 2.050606220347918, "grad_norm": 4.0065364347999575, "learning_rate": 4.874802319451766e-07, "logits/chosen": -0.05396728590130806, "logits/rejected": -0.24599608778953552, "logps/chosen": -400.29998779296875, "logps/rejected": -397.5, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -0.171875, "rewards/margins": 7.270312309265137, "rewards/rejected": -7.4375, "step": 3890 }, { "epoch": 2.0558777016341594, "grad_norm": 4.340292417941958, "learning_rate": 4.861623616236162e-07, "logits/chosen": -0.2322647124528885, "logits/rejected": -0.153289794921875, "logps/chosen": -367.8500061035156, "logps/rejected": -402.8999938964844, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -0.08167114108800888, "rewards/margins": 7.09375, "rewards/rejected": -7.176562309265137, "step": 3900 }, { "epoch": 2.0611491829204005, "grad_norm": 17.774747027520874, "learning_rate": 4.848444913020559e-07, "logits/chosen": -0.177703857421875, "logits/rejected": -0.26514893770217896, "logps/chosen": -376.25, "logps/rejected": -403.29998779296875, "loss": 0.0203, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.4894653260707855, "rewards/margins": 7.339062690734863, "rewards/rejected": -7.8359375, "step": 3910 }, { "epoch": 2.066420664206642, "grad_norm": 3.3176674666400032, "learning_rate": 4.835266209804955e-07, "logits/chosen": -0.11170653998851776, "logits/rejected": -0.348175048828125, "logps/chosen": -365.5, "logps/rejected": -409.5, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.6740356683731079, "rewards/margins": 7.557812690734863, "rewards/rejected": -8.231249809265137, "step": 3920 }, { "epoch": 2.0716921454928836, "grad_norm": 1.9000080000945017, "learning_rate": 4.822087506589351e-07, "logits/chosen": -0.2845703065395355, "logits/rejected": -0.28077393770217896, "logps/chosen": -357.6000061035156, "logps/rejected": -378.95001220703125, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.36091309785842896, "rewards/margins": 7.045312404632568, "rewards/rejected": -7.412499904632568, "step": 3930 }, { "epoch": 2.0769636267791247, "grad_norm": 4.128535623878369, "learning_rate": 4.808908803373748e-07, "logits/chosen": -0.010205077938735485, "logits/rejected": -0.17864379286766052, "logps/chosen": -351.6499938964844, "logps/rejected": -396.6000061035156, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.20820312201976776, "rewards/margins": 7.4765625, "rewards/rejected": -7.265625, "step": 3940 }, { "epoch": 2.0822351080653663, "grad_norm": 20.31623584188594, "learning_rate": 4.795730100158144e-07, "logits/chosen": 0.01441345177590847, "logits/rejected": -0.29531556367874146, "logps/chosen": -356.5, "logps/rejected": -407.29998779296875, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 0.5281738042831421, "rewards/margins": 7.018750190734863, "rewards/rejected": -6.489062309265137, "step": 3950 }, { "epoch": 2.087506589351608, "grad_norm": 11.078762677784905, "learning_rate": 4.782551396942541e-07, "logits/chosen": -0.05563964694738388, "logits/rejected": -0.21535034477710724, "logps/chosen": -326.5, "logps/rejected": -420.29998779296875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 0.43071287870407104, "rewards/margins": 7.292187690734863, "rewards/rejected": -6.865624904632568, "step": 3960 }, { "epoch": 2.0927780706378494, "grad_norm": 2.8816708765506127, "learning_rate": 4.769372693726937e-07, "logits/chosen": -0.10352782905101776, "logits/rejected": -0.2750000059604645, "logps/chosen": -400.3500061035156, "logps/rejected": -439.1000061035156, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.09221191704273224, "rewards/margins": 7.159375190734863, "rewards/rejected": -7.060937404632568, "step": 3970 }, { "epoch": 2.0980495519240905, "grad_norm": 3.550693045843966, "learning_rate": 4.756193990511334e-07, "logits/chosen": -0.17690429091453552, "logits/rejected": -0.40256959199905396, "logps/chosen": -319.8999938964844, "logps/rejected": -406.29998779296875, "loss": 0.0136, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5372558832168579, "rewards/margins": 7.171875, "rewards/rejected": -7.709374904632568, "step": 3980 }, { "epoch": 2.103321033210332, "grad_norm": 25.074534148906636, "learning_rate": 4.7430152872957297e-07, "logits/chosen": -0.13527221977710724, "logits/rejected": -0.2740722596645355, "logps/chosen": -367.0, "logps/rejected": -451.0, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -0.716796875, "rewards/margins": 7.415625095367432, "rewards/rejected": -8.125, "step": 3990 }, { "epoch": 2.1085925144965736, "grad_norm": 1.582783817434099, "learning_rate": 4.729836584080126e-07, "logits/chosen": -0.0875396728515625, "logits/rejected": -0.39739990234375, "logps/chosen": -393.29998779296875, "logps/rejected": -416.8999938964844, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.08500976860523224, "rewards/margins": 7.339062690734863, "rewards/rejected": -7.254687309265137, "step": 4000 }, { "epoch": 2.113863995782815, "grad_norm": 10.853310670317924, "learning_rate": 4.716657880864523e-07, "logits/chosen": -0.13157959282398224, "logits/rejected": -0.44143906235694885, "logps/chosen": -398.75, "logps/rejected": -380.70001220703125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -0.23560485243797302, "rewards/margins": 7.165625095367432, "rewards/rejected": -7.407812595367432, "step": 4010 }, { "epoch": 2.1191354770690563, "grad_norm": 3.2334175468434765, "learning_rate": 4.703479177648919e-07, "logits/chosen": -0.13892212510108948, "logits/rejected": -0.30622559785842896, "logps/chosen": -346.6000061035156, "logps/rejected": -439.70001220703125, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.04130859300494194, "rewards/margins": 7.668749809265137, "rewards/rejected": -7.7109375, "step": 4020 }, { "epoch": 2.124406958355298, "grad_norm": 5.572083275621511, "learning_rate": 4.6903004744333156e-07, "logits/chosen": -0.15053100883960724, "logits/rejected": -0.4593261778354645, "logps/chosen": -378.1499938964844, "logps/rejected": -392.29998779296875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 0.064208984375, "rewards/margins": 7.443749904632568, "rewards/rejected": -7.378125190734863, "step": 4030 }, { "epoch": 2.1296784396415394, "grad_norm": 13.489807808465777, "learning_rate": 4.677121771217712e-07, "logits/chosen": -0.15411376953125, "logits/rejected": -0.208251953125, "logps/chosen": -420.29998779296875, "logps/rejected": -449.29998779296875, "loss": 0.0178, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.226165771484375, "rewards/margins": 7.176562309265137, "rewards/rejected": -7.404687404632568, "step": 4040 }, { "epoch": 2.134949920927781, "grad_norm": 1.9248334963738798, "learning_rate": 4.6639430680021086e-07, "logits/chosen": -0.24826660752296448, "logits/rejected": -0.46489256620407104, "logps/chosen": -350.1499938964844, "logps/rejected": -403.29998779296875, "loss": 0.0252, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.29603272676467896, "rewards/margins": 7.590624809265137, "rewards/rejected": -7.887499809265137, "step": 4050 }, { "epoch": 2.140221402214022, "grad_norm": 6.988441589537027, "learning_rate": 4.6507643647865045e-07, "logits/chosen": -0.23914185166358948, "logits/rejected": -0.41668701171875, "logps/chosen": -375.8500061035156, "logps/rejected": -456.8999938964844, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.7256103754043579, "rewards/margins": 7.959374904632568, "rewards/rejected": -8.693750381469727, "step": 4060 }, { "epoch": 2.1454928835002636, "grad_norm": 12.044218393552931, "learning_rate": 4.6375856615709015e-07, "logits/chosen": -0.22490234673023224, "logits/rejected": -0.32935792207717896, "logps/chosen": -359.04998779296875, "logps/rejected": -406.1000061035156, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.46381837129592896, "rewards/margins": 7.153124809265137, "rewards/rejected": -7.618750095367432, "step": 4070 }, { "epoch": 2.150764364786505, "grad_norm": 1.5343342582116544, "learning_rate": 4.6244069583552975e-07, "logits/chosen": -0.16362304985523224, "logits/rejected": -0.5257812738418579, "logps/chosen": -371.0, "logps/rejected": -384.8999938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.9093017578125, "rewards/margins": 7.248437404632568, "rewards/rejected": -8.1640625, "step": 4080 }, { "epoch": 2.1560358460727462, "grad_norm": 5.652867729899026, "learning_rate": 4.611228255139694e-07, "logits/chosen": -0.291909396648407, "logits/rejected": -0.5101470947265625, "logps/chosen": -433.54998779296875, "logps/rejected": -441.79998779296875, "loss": 0.0168, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.869067370891571, "rewards/margins": 7.465624809265137, "rewards/rejected": -8.3359375, "step": 4090 }, { "epoch": 2.161307327358988, "grad_norm": 2.5872378262664286, "learning_rate": 4.5980495519240904e-07, "logits/chosen": -0.23640136420726776, "logits/rejected": -0.5554138422012329, "logps/chosen": -428.20001220703125, "logps/rejected": -448.0, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.703784167766571, "rewards/margins": 7.748437404632568, "rewards/rejected": -8.449999809265137, "step": 4100 }, { "epoch": 2.1665788086452293, "grad_norm": 5.452578565062607, "learning_rate": 4.584870848708487e-07, "logits/chosen": -0.19808349013328552, "logits/rejected": -0.4637084901332855, "logps/chosen": -386.7250061035156, "logps/rejected": -420.70001220703125, "loss": 0.012, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.4522857666015625, "rewards/margins": 7.3359375, "rewards/rejected": -7.787499904632568, "step": 4110 }, { "epoch": 2.171850289931471, "grad_norm": 16.12422640560244, "learning_rate": 4.571692145492883e-07, "logits/chosen": -0.09543456882238388, "logits/rejected": -0.23878173530101776, "logps/chosen": -340.20001220703125, "logps/rejected": -422.8500061035156, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.24553222954273224, "rewards/margins": 7.379687309265137, "rewards/rejected": -7.626562595367432, "step": 4120 }, { "epoch": 2.177121771217712, "grad_norm": 3.460923522219188, "learning_rate": 4.55851344227728e-07, "logits/chosen": -0.11655273288488388, "logits/rejected": -0.4331420958042145, "logps/chosen": -359.54998779296875, "logps/rejected": -425.20001220703125, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.520751953125, "rewards/margins": 8.015625, "rewards/rejected": -8.540624618530273, "step": 4130 }, { "epoch": 2.1823932525039536, "grad_norm": 11.785628451219253, "learning_rate": 4.545334739061676e-07, "logits/chosen": -0.30079346895217896, "logits/rejected": -0.44251710176467896, "logps/chosen": -349.25, "logps/rejected": -431.95001220703125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -0.5932983160018921, "rewards/margins": 7.178124904632568, "rewards/rejected": -7.771874904632568, "step": 4140 }, { "epoch": 2.187664733790195, "grad_norm": 2.7301805294517605, "learning_rate": 4.532156035846073e-07, "logits/chosen": -0.28570556640625, "logits/rejected": -0.4175781309604645, "logps/chosen": -408.95001220703125, "logps/rejected": -426.1499938964844, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7538086175918579, "rewards/margins": 7.535937309265137, "rewards/rejected": -8.296875, "step": 4150 }, { "epoch": 2.1929362150764367, "grad_norm": 1.2099639043085022, "learning_rate": 4.5189773326304693e-07, "logits/chosen": -0.14647407829761505, "logits/rejected": -0.3932861387729645, "logps/chosen": -376.25, "logps/rejected": -414.45001220703125, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -0.8965820074081421, "rewards/margins": 6.775000095367432, "rewards/rejected": -7.673437595367432, "step": 4160 }, { "epoch": 2.1982076963626778, "grad_norm": 37.98278454260939, "learning_rate": 4.505798629414865e-07, "logits/chosen": -0.30976563692092896, "logits/rejected": -0.50634765625, "logps/chosen": -403.54998779296875, "logps/rejected": -417.1499938964844, "loss": 0.0179, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6734619140625, "rewards/margins": 7.370312690734863, "rewards/rejected": -8.0390625, "step": 4170 }, { "epoch": 2.2034791776489193, "grad_norm": 2.3777030532815724, "learning_rate": 4.492619926199262e-07, "logits/chosen": -0.2952880859375, "logits/rejected": -0.49214476346969604, "logps/chosen": -369.6499938964844, "logps/rejected": -391.70001220703125, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -0.615740954875946, "rewards/margins": 7.246874809265137, "rewards/rejected": -7.860937595367432, "step": 4180 }, { "epoch": 2.208750658935161, "grad_norm": 18.385892516987578, "learning_rate": 4.479441222983658e-07, "logits/chosen": -0.09102783352136612, "logits/rejected": -0.3756957948207855, "logps/chosen": -403.3500061035156, "logps/rejected": -414.29998779296875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -0.2566162049770355, "rewards/margins": 7.5390625, "rewards/rejected": -7.795312404632568, "step": 4190 }, { "epoch": 2.2140221402214024, "grad_norm": 17.92275620067919, "learning_rate": 4.4662625197680546e-07, "logits/chosen": -0.192779541015625, "logits/rejected": -0.3813842833042145, "logps/chosen": -363.29998779296875, "logps/rejected": -389.6499938964844, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7840820550918579, "rewards/margins": 7.221875190734863, "rewards/rejected": -8.003125190734863, "step": 4200 }, { "epoch": 2.2192936215076435, "grad_norm": 14.553915599040469, "learning_rate": 4.453083816552451e-07, "logits/chosen": -0.26203614473342896, "logits/rejected": -0.39960938692092896, "logps/chosen": -399.54998779296875, "logps/rejected": -440.8999938964844, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8463379144668579, "rewards/margins": 7.753125190734863, "rewards/rejected": -8.595312118530273, "step": 4210 }, { "epoch": 2.224565102793885, "grad_norm": 1.6217552626761258, "learning_rate": 4.4399051133368476e-07, "logits/chosen": -0.17344817519187927, "logits/rejected": -0.38139647245407104, "logps/chosen": -397.45001220703125, "logps/rejected": -444.0, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.48414307832717896, "rewards/margins": 7.871874809265137, "rewards/rejected": -8.359375, "step": 4220 }, { "epoch": 2.2298365840801266, "grad_norm": 4.419679155781536, "learning_rate": 4.4267264101212435e-07, "logits/chosen": -0.0999755859375, "logits/rejected": -0.3412231504917145, "logps/chosen": -374.8999938964844, "logps/rejected": -437.5, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -0.5093139410018921, "rewards/margins": 7.809374809265137, "rewards/rejected": -8.317187309265137, "step": 4230 }, { "epoch": 2.2351080653663677, "grad_norm": 21.17546658347682, "learning_rate": 4.4135477069056405e-07, "logits/chosen": -0.2932372987270355, "logits/rejected": -0.590771496295929, "logps/chosen": -376.75, "logps/rejected": -427.20001220703125, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -1.6397705078125, "rewards/margins": 7.9453125, "rewards/rejected": -9.582812309265137, "step": 4240 }, { "epoch": 2.2403795466526093, "grad_norm": 9.349533359615851, "learning_rate": 4.4003690036900365e-07, "logits/chosen": -0.17473144829273224, "logits/rejected": -0.49162596464157104, "logps/chosen": -328.6499938964844, "logps/rejected": -422.6000061035156, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.3765747547149658, "rewards/margins": 8.251562118530273, "rewards/rejected": -9.631250381469727, "step": 4250 }, { "epoch": 2.245651027938851, "grad_norm": 32.071583229774504, "learning_rate": 4.387190300474433e-07, "logits/chosen": -0.05718383938074112, "logits/rejected": -0.49506837129592896, "logps/chosen": -388.45001220703125, "logps/rejected": -428.5, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -1.4778320789337158, "rewards/margins": 8.0234375, "rewards/rejected": -9.501562118530273, "step": 4260 }, { "epoch": 2.2509225092250924, "grad_norm": 15.69657570513751, "learning_rate": 4.3740115972588294e-07, "logits/chosen": -0.2761474549770355, "logits/rejected": -0.4370483458042145, "logps/chosen": -384.29998779296875, "logps/rejected": -455.5, "loss": 0.0199, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5139648914337158, "rewards/margins": 7.754687309265137, "rewards/rejected": -9.2734375, "step": 4270 }, { "epoch": 2.2561939905113335, "grad_norm": 4.217378859892804, "learning_rate": 4.360832894043226e-07, "logits/chosen": -0.17087706923484802, "logits/rejected": -0.4150451719760895, "logps/chosen": -354.3999938964844, "logps/rejected": -436.70001220703125, "loss": 0.0156, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.209375023841858, "rewards/margins": 7.482812404632568, "rewards/rejected": -8.696874618530273, "step": 4280 }, { "epoch": 2.261465471797575, "grad_norm": 3.7153258061893735, "learning_rate": 4.3476541908276224e-07, "logits/chosen": -0.20993652939796448, "logits/rejected": -0.31267088651657104, "logps/chosen": -363.20001220703125, "logps/rejected": -451.8999938964844, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.05427246168255806, "rewards/margins": 8.003125190734863, "rewards/rejected": -8.051562309265137, "step": 4290 }, { "epoch": 2.2667369530838166, "grad_norm": 8.524387312625969, "learning_rate": 4.334475487612019e-07, "logits/chosen": -0.12381591647863388, "logits/rejected": -0.36658936738967896, "logps/chosen": -371.04998779296875, "logps/rejected": -402.75, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.722949206829071, "rewards/margins": 7.5703125, "rewards/rejected": -8.295312881469727, "step": 4300 }, { "epoch": 2.272008434370058, "grad_norm": 1.582996489405592, "learning_rate": 4.3212967843964153e-07, "logits/chosen": -0.27581787109375, "logits/rejected": -0.4131835997104645, "logps/chosen": -388.25, "logps/rejected": -440.5, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -0.8390136957168579, "rewards/margins": 7.740624904632568, "rewards/rejected": -8.5859375, "step": 4310 }, { "epoch": 2.2772799156562993, "grad_norm": 39.0476167142262, "learning_rate": 4.3081180811808113e-07, "logits/chosen": -0.26965636014938354, "logits/rejected": -0.35773926973342896, "logps/chosen": -360.70001220703125, "logps/rejected": -429.95001220703125, "loss": 0.0265, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9123169183731079, "rewards/margins": 8.282812118530273, "rewards/rejected": -9.196874618530273, "step": 4320 }, { "epoch": 2.282551396942541, "grad_norm": 5.901189180857347, "learning_rate": 4.2949393779652083e-07, "logits/chosen": -0.20180663466453552, "logits/rejected": -0.3255371153354645, "logps/chosen": -413.75, "logps/rejected": -446.0, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -0.7433105707168579, "rewards/margins": 8.004687309265137, "rewards/rejected": -8.745312690734863, "step": 4330 }, { "epoch": 2.2878228782287824, "grad_norm": 3.3993709675826658, "learning_rate": 4.281760674749604e-07, "logits/chosen": -0.11517486721277237, "logits/rejected": -0.2964843809604645, "logps/chosen": -386.25, "logps/rejected": -405.3999938964844, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -0.72625732421875, "rewards/margins": 7.154687404632568, "rewards/rejected": -7.884375095367432, "step": 4340 }, { "epoch": 2.293094359515024, "grad_norm": 29.467407139830847, "learning_rate": 4.268581971534001e-07, "logits/chosen": -0.18720093369483948, "logits/rejected": -0.28047484159469604, "logps/chosen": -357.25, "logps/rejected": -394.25, "loss": 0.0207, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.13303832709789276, "rewards/margins": 7.376562595367432, "rewards/rejected": -7.509375095367432, "step": 4350 }, { "epoch": 2.298365840801265, "grad_norm": 3.62067532123373, "learning_rate": 4.255403268318397e-07, "logits/chosen": -0.14404296875, "logits/rejected": -0.3619140684604645, "logps/chosen": -345.5, "logps/rejected": -388.3999938964844, "loss": 0.0123, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.22912903130054474, "rewards/margins": 7.540625095367432, "rewards/rejected": -7.765625, "step": 4360 }, { "epoch": 2.3036373220875066, "grad_norm": 3.4662793655756157, "learning_rate": 4.2422245651027937e-07, "logits/chosen": -0.2765869200229645, "logits/rejected": -0.31446534395217896, "logps/chosen": -373.5, "logps/rejected": -432.8500061035156, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.4785400331020355, "rewards/margins": 7.667187690734863, "rewards/rejected": -8.1484375, "step": 4370 }, { "epoch": 2.308908803373748, "grad_norm": 6.471189708077287, "learning_rate": 4.22904586188719e-07, "logits/chosen": -0.2697082459926605, "logits/rejected": -0.4768310487270355, "logps/chosen": -405.25, "logps/rejected": -452.6000061035156, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.7754440307617188, "rewards/margins": 7.696875095367432, "rewards/rejected": -8.4765625, "step": 4380 }, { "epoch": 2.3141802846599893, "grad_norm": 10.910667326754142, "learning_rate": 4.2158671586715866e-07, "logits/chosen": -0.2723388671875, "logits/rejected": -0.38941651582717896, "logps/chosen": -394.8500061035156, "logps/rejected": -427.6000061035156, "loss": 0.0153, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.777478039264679, "rewards/margins": 7.551562309265137, "rewards/rejected": -8.324999809265137, "step": 4390 }, { "epoch": 2.319451765946231, "grad_norm": 3.684818816127233, "learning_rate": 4.2026884554559826e-07, "logits/chosen": -0.27238768339157104, "logits/rejected": -0.5626465082168579, "logps/chosen": -366.79998779296875, "logps/rejected": -412.70001220703125, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.49994200468063354, "rewards/margins": 7.842187404632568, "rewards/rejected": -8.348437309265137, "step": 4400 }, { "epoch": 2.3247232472324724, "grad_norm": 5.129192844635888, "learning_rate": 4.1895097522403796e-07, "logits/chosen": -0.16806641221046448, "logits/rejected": -0.3578124940395355, "logps/chosen": -395.04998779296875, "logps/rejected": -425.3999938964844, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -0.42695313692092896, "rewards/margins": 7.734375, "rewards/rejected": -8.162500381469727, "step": 4410 }, { "epoch": 2.329994728518714, "grad_norm": 9.170719951192334, "learning_rate": 4.1763310490247755e-07, "logits/chosen": -0.2827819883823395, "logits/rejected": -0.378775030374527, "logps/chosen": -380.20001220703125, "logps/rejected": -435.6000061035156, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.854296863079071, "rewards/margins": 7.7734375, "rewards/rejected": -8.623437881469727, "step": 4420 }, { "epoch": 2.335266209804955, "grad_norm": 6.354842421511502, "learning_rate": 4.163152345809172e-07, "logits/chosen": -0.18536376953125, "logits/rejected": -0.2655029296875, "logps/chosen": -404.70001220703125, "logps/rejected": -483.20001220703125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -0.7341552972793579, "rewards/margins": 7.978125095367432, "rewards/rejected": -8.720312118530273, "step": 4430 }, { "epoch": 2.3405376910911966, "grad_norm": 2.4967666877727965, "learning_rate": 4.149973642593569e-07, "logits/chosen": -0.11610718071460724, "logits/rejected": -0.46367186307907104, "logps/chosen": -410.8999938964844, "logps/rejected": -425.20001220703125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.80126953125, "rewards/margins": 7.7265625, "rewards/rejected": -8.528124809265137, "step": 4440 }, { "epoch": 2.345809172377438, "grad_norm": 8.618575396814487, "learning_rate": 4.136794939377965e-07, "logits/chosen": -0.24862059950828552, "logits/rejected": -0.517529308795929, "logps/chosen": -346.6000061035156, "logps/rejected": -418.79998779296875, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.2659180164337158, "rewards/margins": 7.760937690734863, "rewards/rejected": -9.021875381469727, "step": 4450 }, { "epoch": 2.3510806536636797, "grad_norm": 1.7208829308644153, "learning_rate": 4.1236162361623614e-07, "logits/chosen": -0.31683349609375, "logits/rejected": -0.5711517333984375, "logps/chosen": -402.8999938964844, "logps/rejected": -425.0, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.1553833484649658, "rewards/margins": 8.234375, "rewards/rejected": -9.390625, "step": 4460 }, { "epoch": 2.356352134949921, "grad_norm": 10.499810792651802, "learning_rate": 4.110437532946758e-07, "logits/chosen": -0.37510985136032104, "logits/rejected": -0.4481567442417145, "logps/chosen": -377.25, "logps/rejected": -433.79998779296875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.438330054283142, "rewards/margins": 8.149999618530273, "rewards/rejected": -9.584375381469727, "step": 4470 }, { "epoch": 2.3616236162361623, "grad_norm": 9.266031477204587, "learning_rate": 4.0972588297311544e-07, "logits/chosen": -0.42424315214157104, "logits/rejected": -0.5694335699081421, "logps/chosen": -364.1000061035156, "logps/rejected": -460.20001220703125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -1.337133765220642, "rewards/margins": 8.300000190734863, "rewards/rejected": -9.631250381469727, "step": 4480 }, { "epoch": 2.366895097522404, "grad_norm": 48.135636778973186, "learning_rate": 4.0840801265155503e-07, "logits/chosen": -0.3443359434604645, "logits/rejected": -0.5763183832168579, "logps/chosen": -363.6499938964844, "logps/rejected": -405.75, "loss": 0.0205, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.057275414466858, "rewards/margins": 8.026562690734863, "rewards/rejected": -9.082812309265137, "step": 4490 }, { "epoch": 2.3721665788086455, "grad_norm": 3.7092282089919735, "learning_rate": 4.0709014232999473e-07, "logits/chosen": -0.147064208984375, "logits/rejected": -0.49345701932907104, "logps/chosen": -397.1499938964844, "logps/rejected": -416.1499938964844, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -0.9048827886581421, "rewards/margins": 7.651562690734863, "rewards/rejected": -8.559374809265137, "step": 4500 }, { "epoch": 2.3774380600948866, "grad_norm": 3.7584368696610193, "learning_rate": 4.0577227200843433e-07, "logits/chosen": -0.13027343153953552, "logits/rejected": -0.582928478717804, "logps/chosen": -417.1000061035156, "logps/rejected": -457.70001220703125, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -0.40971678495407104, "rewards/margins": 8.462499618530273, "rewards/rejected": -8.871874809265137, "step": 4510 }, { "epoch": 2.382709541381128, "grad_norm": 3.1888057757689197, "learning_rate": 4.0445440168687403e-07, "logits/chosen": -0.3445495665073395, "logits/rejected": -0.57080078125, "logps/chosen": -340.54998779296875, "logps/rejected": -402.25, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -1.093603491783142, "rewards/margins": 7.685937404632568, "rewards/rejected": -8.782812118530273, "step": 4520 }, { "epoch": 2.3879810226673697, "grad_norm": 14.385607387945566, "learning_rate": 4.031365313653136e-07, "logits/chosen": -0.22826537489891052, "logits/rejected": -0.584210216999054, "logps/chosen": -369.20001220703125, "logps/rejected": -418.70001220703125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -1.59033203125, "rewards/margins": 7.623437404632568, "rewards/rejected": -9.220312118530273, "step": 4530 }, { "epoch": 2.3932525039536108, "grad_norm": 3.0110282301344546, "learning_rate": 4.0181866104375327e-07, "logits/chosen": -0.22529296576976776, "logits/rejected": -0.399444580078125, "logps/chosen": -361.45001220703125, "logps/rejected": -456.6000061035156, "loss": 0.0177, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.328515648841858, "rewards/margins": 8.298437118530273, "rewards/rejected": -9.634374618530273, "step": 4540 }, { "epoch": 2.3985239852398523, "grad_norm": 43.81983948790104, "learning_rate": 4.005007907221929e-07, "logits/chosen": -0.31610107421875, "logits/rejected": -0.4532470703125, "logps/chosen": -377.6499938964844, "logps/rejected": -434.29998779296875, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -1.412695288658142, "rewards/margins": 7.8203125, "rewards/rejected": -9.232812881469727, "step": 4550 }, { "epoch": 2.403795466526094, "grad_norm": 2.79207991103157, "learning_rate": 3.9918292040063256e-07, "logits/chosen": -0.25982666015625, "logits/rejected": -0.4123779237270355, "logps/chosen": -365.8500061035156, "logps/rejected": -415.79998779296875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -1.016259789466858, "rewards/margins": 7.912499904632568, "rewards/rejected": -8.931249618530273, "step": 4560 }, { "epoch": 2.4090669478123354, "grad_norm": 1.79426427907428, "learning_rate": 3.978650500790722e-07, "logits/chosen": -0.22843018174171448, "logits/rejected": -0.4244140684604645, "logps/chosen": -406.5, "logps/rejected": -483.5, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.520703136920929, "rewards/margins": 8.462499618530273, "rewards/rejected": -8.975000381469727, "step": 4570 }, { "epoch": 2.4143384290985765, "grad_norm": 6.34150905949501, "learning_rate": 3.9654717975751186e-07, "logits/chosen": -0.21171875298023224, "logits/rejected": -0.5549682378768921, "logps/chosen": -384.45001220703125, "logps/rejected": -443.20001220703125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.43214112520217896, "rewards/margins": 7.746874809265137, "rewards/rejected": -8.173437118530273, "step": 4580 }, { "epoch": 2.419609910384818, "grad_norm": 5.630512091741282, "learning_rate": 3.952293094359515e-07, "logits/chosen": -0.30780029296875, "logits/rejected": -0.458261102437973, "logps/chosen": -350.25, "logps/rejected": -424.1000061035156, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.126708984375, "rewards/margins": 7.415625095367432, "rewards/rejected": -8.543749809265137, "step": 4590 }, { "epoch": 2.4248813916710596, "grad_norm": 6.5760696419474876, "learning_rate": 3.939114391143911e-07, "logits/chosen": -0.30469971895217896, "logits/rejected": -0.4174438416957855, "logps/chosen": -382.1000061035156, "logps/rejected": -426.0, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -1.1200683116912842, "rewards/margins": 7.457812309265137, "rewards/rejected": -8.582812309265137, "step": 4600 }, { "epoch": 2.4301528729573008, "grad_norm": 8.189163588335342, "learning_rate": 3.925935687928308e-07, "logits/chosen": -0.35761719942092896, "logits/rejected": -0.512939453125, "logps/chosen": -379.95001220703125, "logps/rejected": -435.29998779296875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.7496093511581421, "rewards/margins": 8.009374618530273, "rewards/rejected": -8.756250381469727, "step": 4610 }, { "epoch": 2.4354243542435423, "grad_norm": 17.088883625384916, "learning_rate": 3.912756984712704e-07, "logits/chosen": -0.2911376953125, "logits/rejected": -0.519457995891571, "logps/chosen": -398.95001220703125, "logps/rejected": -435.3999938964844, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.9731689691543579, "rewards/margins": 7.465624809265137, "rewards/rejected": -8.4375, "step": 4620 }, { "epoch": 2.440695835529784, "grad_norm": 0.9918485457385189, "learning_rate": 3.8995782814971004e-07, "logits/chosen": -0.44453126192092896, "logits/rejected": -0.595654308795929, "logps/chosen": -338.3500061035156, "logps/rejected": -375.6000061035156, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.857836902141571, "rewards/margins": 7.731249809265137, "rewards/rejected": -8.595312118530273, "step": 4630 }, { "epoch": 2.4459673168160254, "grad_norm": 0.8957182200105219, "learning_rate": 3.886399578281497e-07, "logits/chosen": -0.23833923041820526, "logits/rejected": -0.4813476502895355, "logps/chosen": -380.8500061035156, "logps/rejected": -420.29998779296875, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -0.895458996295929, "rewards/margins": 7.576562404632568, "rewards/rejected": -8.470312118530273, "step": 4640 }, { "epoch": 2.451238798102267, "grad_norm": 7.0146071571349085, "learning_rate": 3.8732208750658934e-07, "logits/chosen": -0.14305420219898224, "logits/rejected": -0.504321277141571, "logps/chosen": -394.125, "logps/rejected": -438.25, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.178369164466858, "rewards/margins": 7.59375, "rewards/rejected": -8.770312309265137, "step": 4650 }, { "epoch": 2.456510279388508, "grad_norm": 1.9757552230439581, "learning_rate": 3.8600421718502893e-07, "logits/chosen": -0.29420775175094604, "logits/rejected": -0.536633312702179, "logps/chosen": -390.5, "logps/rejected": -400.29998779296875, "loss": 0.0318, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.985119640827179, "rewards/margins": 7.721875190734863, "rewards/rejected": -8.701562881469727, "step": 4660 }, { "epoch": 2.4617817606747496, "grad_norm": 3.402348756266666, "learning_rate": 3.8468634686346863e-07, "logits/chosen": -0.17346802353858948, "logits/rejected": -0.30505067110061646, "logps/chosen": -386.45001220703125, "logps/rejected": -435.20001220703125, "loss": 0.0118, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.879150390625, "rewards/margins": 7.846875190734863, "rewards/rejected": -8.7265625, "step": 4670 }, { "epoch": 2.467053241960991, "grad_norm": 4.674091845214333, "learning_rate": 3.8336847654190823e-07, "logits/chosen": -0.3859497010707855, "logits/rejected": -0.546185314655304, "logps/chosen": -377.95001220703125, "logps/rejected": -386.6000061035156, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -1.151953101158142, "rewards/margins": 7.560937404632568, "rewards/rejected": -8.717187881469727, "step": 4680 }, { "epoch": 2.4723247232472323, "grad_norm": 7.843818875583583, "learning_rate": 3.8205060622034793e-07, "logits/chosen": -0.365509033203125, "logits/rejected": -0.505328357219696, "logps/chosen": -338.3500061035156, "logps/rejected": -408.6000061035156, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.935351550579071, "rewards/margins": 7.860937595367432, "rewards/rejected": -8.796875, "step": 4690 }, { "epoch": 2.477596204533474, "grad_norm": 1.9654564419820901, "learning_rate": 3.807327358987875e-07, "logits/chosen": -0.15504150092601776, "logits/rejected": -0.44853514432907104, "logps/chosen": -434.0, "logps/rejected": -439.79998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.6263183355331421, "rewards/margins": 8.126562118530273, "rewards/rejected": -8.754687309265137, "step": 4700 }, { "epoch": 2.4828676858197154, "grad_norm": 7.058617841181233, "learning_rate": 3.7941486557722717e-07, "logits/chosen": -0.42829591035842896, "logits/rejected": -0.5163818597793579, "logps/chosen": -343.8500061035156, "logps/rejected": -400.6000061035156, "loss": 0.0236, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9739745855331421, "rewards/margins": 7.690625190734863, "rewards/rejected": -8.662500381469727, "step": 4710 }, { "epoch": 2.488139167105957, "grad_norm": 2.275425909537999, "learning_rate": 3.7809699525566687e-07, "logits/chosen": -0.32402342557907104, "logits/rejected": -0.4977172911167145, "logps/chosen": -354.8999938964844, "logps/rejected": -398.45001220703125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.8499511480331421, "rewards/margins": 8.073437690734863, "rewards/rejected": -8.926562309265137, "step": 4720 }, { "epoch": 2.493410648392198, "grad_norm": 1.8523840013393105, "learning_rate": 3.7677912493410647e-07, "logits/chosen": -0.4079833924770355, "logits/rejected": -0.6597656011581421, "logps/chosen": -369.1000061035156, "logps/rejected": -405.8999938964844, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -1.437902808189392, "rewards/margins": 7.296875, "rewards/rejected": -8.745312690734863, "step": 4730 }, { "epoch": 2.4986821296784396, "grad_norm": 7.4173206284608195, "learning_rate": 3.754612546125461e-07, "logits/chosen": -0.40681153535842896, "logits/rejected": -0.610400378704071, "logps/chosen": -357.25, "logps/rejected": -441.29998779296875, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.1260589361190796, "rewards/margins": 8.635937690734863, "rewards/rejected": -9.762499809265137, "step": 4740 }, { "epoch": 2.503953610964681, "grad_norm": 1.975729956826534, "learning_rate": 3.7414338429098576e-07, "logits/chosen": -0.38959962129592896, "logits/rejected": -0.5747314691543579, "logps/chosen": -425.75, "logps/rejected": -468.54998779296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.3281981945037842, "rewards/margins": 8.353124618530273, "rewards/rejected": -9.678125381469727, "step": 4750 }, { "epoch": 2.5092250922509223, "grad_norm": 22.906466123652756, "learning_rate": 3.728255139694254e-07, "logits/chosen": -0.565136730670929, "logits/rejected": -0.664111316204071, "logps/chosen": -405.20001220703125, "logps/rejected": -456.29998779296875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.742285132408142, "rewards/margins": 8.014062881469727, "rewards/rejected": -9.762499809265137, "step": 4760 }, { "epoch": 2.514496573537164, "grad_norm": 14.482357415111892, "learning_rate": 3.71507643647865e-07, "logits/chosen": -0.3931030333042145, "logits/rejected": -0.4639221131801605, "logps/chosen": -362.3500061035156, "logps/rejected": -437.8999938964844, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.8408447504043579, "rewards/margins": 8.173437118530273, "rewards/rejected": -9.015625, "step": 4770 }, { "epoch": 2.5197680548234054, "grad_norm": 8.920962414953204, "learning_rate": 3.701897733263047e-07, "logits/chosen": -0.2917236387729645, "logits/rejected": -0.546875, "logps/chosen": -399.8500061035156, "logps/rejected": -452.1000061035156, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.9975341558456421, "rewards/margins": 8.4453125, "rewards/rejected": -9.442187309265137, "step": 4780 }, { "epoch": 2.525039536109647, "grad_norm": 74.98173177589238, "learning_rate": 3.688719030047443e-07, "logits/chosen": -0.162994384765625, "logits/rejected": -0.5218261480331421, "logps/chosen": -373.6000061035156, "logps/rejected": -376.25, "loss": 0.0152, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.935595691204071, "rewards/margins": 7.426562309265137, "rewards/rejected": -8.3671875, "step": 4790 }, { "epoch": 2.5303110173958885, "grad_norm": 7.952530662738245, "learning_rate": 3.6755403268318395e-07, "logits/chosen": -0.4493347108364105, "logits/rejected": -0.562915027141571, "logps/chosen": -387.5, "logps/rejected": -408.6000061035156, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8606933355331421, "rewards/margins": 7.300000190734863, "rewards/rejected": -8.15625, "step": 4800 }, { "epoch": 2.5355824986821296, "grad_norm": 14.259093449707041, "learning_rate": 3.662361623616236e-07, "logits/chosen": -0.2750488221645355, "logits/rejected": -0.526684582233429, "logps/chosen": -383.29998779296875, "logps/rejected": -459.1000061035156, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.7403320074081421, "rewards/margins": 8.110937118530273, "rewards/rejected": -8.84375, "step": 4810 }, { "epoch": 2.540853979968371, "grad_norm": 72.47971999839423, "learning_rate": 3.6491829204006324e-07, "logits/chosen": -0.28863525390625, "logits/rejected": -0.595751941204071, "logps/chosen": -419.95001220703125, "logps/rejected": -443.3500061035156, "loss": 0.0303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.985034167766571, "rewards/margins": 8.354687690734863, "rewards/rejected": -9.340624809265137, "step": 4820 }, { "epoch": 2.5461254612546127, "grad_norm": 0.9546901590402234, "learning_rate": 3.6360042171850284e-07, "logits/chosen": -0.22850951552391052, "logits/rejected": -0.6024535894393921, "logps/chosen": -354.04998779296875, "logps/rejected": -415.8500061035156, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.045263648033142, "rewards/margins": 7.934374809265137, "rewards/rejected": -8.981249809265137, "step": 4830 }, { "epoch": 2.551396942540854, "grad_norm": 2.149661786917239, "learning_rate": 3.6228255139694254e-07, "logits/chosen": -0.3479247987270355, "logits/rejected": -0.65716552734375, "logps/chosen": -381.20001220703125, "logps/rejected": -414.8999938964844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.2613036632537842, "rewards/margins": 8.004687309265137, "rewards/rejected": -9.260937690734863, "step": 4840 }, { "epoch": 2.5566684238270954, "grad_norm": 2.648391324726363, "learning_rate": 3.609646810753822e-07, "logits/chosen": -0.38105469942092896, "logits/rejected": -0.49492186307907104, "logps/chosen": -415.3999938964844, "logps/rejected": -436.20001220703125, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -1.19873046875, "rewards/margins": 7.784375190734863, "rewards/rejected": -8.995312690734863, "step": 4850 }, { "epoch": 2.561939905113337, "grad_norm": 0.9166496909985922, "learning_rate": 3.5964681075382183e-07, "logits/chosen": -0.20719298720359802, "logits/rejected": -0.4170471131801605, "logps/chosen": -400.6000061035156, "logps/rejected": -474.20001220703125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.912158191204071, "rewards/margins": 8.214062690734863, "rewards/rejected": -9.121874809265137, "step": 4860 }, { "epoch": 2.5672113863995785, "grad_norm": 5.620477223609186, "learning_rate": 3.583289404322615e-07, "logits/chosen": -0.35887449979782104, "logits/rejected": -0.59710693359375, "logps/chosen": -406.70001220703125, "logps/rejected": -421.3500061035156, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.682293713092804, "rewards/margins": 7.698437690734863, "rewards/rejected": -8.379687309265137, "step": 4870 }, { "epoch": 2.5724828676858196, "grad_norm": 2.1864225720319372, "learning_rate": 3.570110701107011e-07, "logits/chosen": -0.147705078125, "logits/rejected": -0.4453125, "logps/chosen": -407.8999938964844, "logps/rejected": -407.8500061035156, "loss": 0.0186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.719805896282196, "rewards/margins": 7.5078125, "rewards/rejected": -8.225000381469727, "step": 4880 }, { "epoch": 2.577754348972061, "grad_norm": 70.70948847192957, "learning_rate": 3.556931997891408e-07, "logits/chosen": -0.3884018063545227, "logits/rejected": -0.4646972715854645, "logps/chosen": -367.6499938964844, "logps/rejected": -425.3999938964844, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.192724585533142, "rewards/margins": 7.732812404632568, "rewards/rejected": -8.928125381469727, "step": 4890 }, { "epoch": 2.5830258302583027, "grad_norm": 8.914638681364167, "learning_rate": 3.5437532946758037e-07, "logits/chosen": -0.3766845762729645, "logits/rejected": -0.5653320550918579, "logps/chosen": -335.3999938964844, "logps/rejected": -402.8999938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.1393554210662842, "rewards/margins": 7.379687309265137, "rewards/rejected": -8.520312309265137, "step": 4900 }, { "epoch": 2.588297311544544, "grad_norm": 103.73404854879043, "learning_rate": 3.5305745914602e-07, "logits/chosen": -0.25103759765625, "logits/rejected": -0.5426025390625, "logps/chosen": -356.45001220703125, "logps/rejected": -451.5, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3760254383087158, "rewards/margins": 8.103124618530273, "rewards/rejected": -9.479687690734863, "step": 4910 }, { "epoch": 2.5935687928307853, "grad_norm": 6.326206500383713, "learning_rate": 3.5173958882445966e-07, "logits/chosen": -0.31376951932907104, "logits/rejected": -0.46156007051467896, "logps/chosen": -385.75, "logps/rejected": -429.6000061035156, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -0.9732666015625, "rewards/margins": 7.854687690734863, "rewards/rejected": -8.829687118530273, "step": 4920 }, { "epoch": 2.598840274117027, "grad_norm": 4.673644274040635, "learning_rate": 3.504217185028993e-07, "logits/chosen": -0.43205565214157104, "logits/rejected": -0.5284179449081421, "logps/chosen": -384.6499938964844, "logps/rejected": -416.79998779296875, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.33781737089157104, "rewards/margins": 7.545312404632568, "rewards/rejected": -7.873437404632568, "step": 4930 }, { "epoch": 2.6041117554032684, "grad_norm": 4.538672763749924, "learning_rate": 3.491038481813389e-07, "logits/chosen": -0.22139891982078552, "logits/rejected": -0.45952147245407104, "logps/chosen": -401.70001220703125, "logps/rejected": -463.25, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.3339477479457855, "rewards/margins": 7.90625, "rewards/rejected": -8.25, "step": 4940 }, { "epoch": 2.60938323668951, "grad_norm": 17.972219989410316, "learning_rate": 3.477859778597786e-07, "logits/chosen": -0.21462401747703552, "logits/rejected": -0.3244384825229645, "logps/chosen": -421.75, "logps/rejected": -426.0, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -0.611376941204071, "rewards/margins": 7.160937309265137, "rewards/rejected": -7.768750190734863, "step": 4950 }, { "epoch": 2.614654717975751, "grad_norm": 4.15004342607609, "learning_rate": 3.464681075382182e-07, "logits/chosen": -0.20628662407398224, "logits/rejected": -0.46022337675094604, "logps/chosen": -369.70001220703125, "logps/rejected": -426.6499938964844, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6710449457168579, "rewards/margins": 7.6015625, "rewards/rejected": -8.279687881469727, "step": 4960 }, { "epoch": 2.6199261992619927, "grad_norm": 1.2554000380408925, "learning_rate": 3.4515023721665785e-07, "logits/chosen": -0.15476074814796448, "logits/rejected": -0.4797515869140625, "logps/chosen": -367.79998779296875, "logps/rejected": -370.54998779296875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.62548828125, "rewards/margins": 7.178124904632568, "rewards/rejected": -7.803124904632568, "step": 4970 }, { "epoch": 2.625197680548234, "grad_norm": 5.360944976912797, "learning_rate": 3.438323668950975e-07, "logits/chosen": -0.16298218071460724, "logits/rejected": -0.6297363042831421, "logps/chosen": -348.45001220703125, "logps/rejected": -386.95001220703125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.5072265863418579, "rewards/margins": 7.875, "rewards/rejected": -8.384374618530273, "step": 4980 }, { "epoch": 2.6304691618344753, "grad_norm": 1.6686408214215738, "learning_rate": 3.4251449657353714e-07, "logits/chosen": -0.32622069120407104, "logits/rejected": -0.3419250547885895, "logps/chosen": -388.6000061035156, "logps/rejected": -450.8999938964844, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.4961181581020355, "rewards/margins": 7.792187690734863, "rewards/rejected": -8.292187690734863, "step": 4990 }, { "epoch": 2.635740643120717, "grad_norm": 0.5998418777955173, "learning_rate": 3.411966262519768e-07, "logits/chosen": -0.17811278998851776, "logits/rejected": -0.610034167766571, "logps/chosen": -401.1000061035156, "logps/rejected": -408.45001220703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5987304449081421, "rewards/margins": 7.675000190734863, "rewards/rejected": -8.28125, "step": 5000 }, { "epoch": 2.6410121244069584, "grad_norm": 7.263992212881305, "learning_rate": 3.3987875593041644e-07, "logits/chosen": -0.3087402284145355, "logits/rejected": -0.5065063238143921, "logps/chosen": -330.5, "logps/rejected": -352.79998779296875, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.679443359375, "rewards/margins": 7.420312404632568, "rewards/rejected": -8.096875190734863, "step": 5010 }, { "epoch": 2.6462836056932, "grad_norm": 10.799327218089909, "learning_rate": 3.385608856088561e-07, "logits/chosen": -0.3046630918979645, "logits/rejected": -0.3982788026332855, "logps/chosen": -378.54998779296875, "logps/rejected": -443.70001220703125, "loss": 0.019, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9379638433456421, "rewards/margins": 7.339062690734863, "rewards/rejected": -8.278124809265137, "step": 5020 }, { "epoch": 2.651555086979441, "grad_norm": 2.9933811204616982, "learning_rate": 3.372430152872957e-07, "logits/chosen": -0.41724854707717896, "logits/rejected": -0.566845715045929, "logps/chosen": -377.04998779296875, "logps/rejected": -389.54998779296875, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.980700671672821, "rewards/margins": 7.360937595367432, "rewards/rejected": -8.332812309265137, "step": 5030 }, { "epoch": 2.6568265682656826, "grad_norm": 2.139906417701744, "learning_rate": 3.359251449657354e-07, "logits/chosen": -0.26777344942092896, "logits/rejected": -0.5028167963027954, "logps/chosen": -397.0, "logps/rejected": -387.95001220703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.7501220703125, "rewards/margins": 7.714062690734863, "rewards/rejected": -8.464062690734863, "step": 5040 }, { "epoch": 2.662098049551924, "grad_norm": 1.7187229054335453, "learning_rate": 3.34607274644175e-07, "logits/chosen": -0.27851563692092896, "logits/rejected": -0.5394653081893921, "logps/chosen": -442.6000061035156, "logps/rejected": -451.3999938964844, "loss": 0.0124, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.020257592201233, "rewards/margins": 7.865624904632568, "rewards/rejected": -8.881250381469727, "step": 5050 }, { "epoch": 2.6673695308381653, "grad_norm": 4.527282447367034, "learning_rate": 3.332894043226147e-07, "logits/chosen": -0.1497802734375, "logits/rejected": -0.56524658203125, "logps/chosen": -362.20001220703125, "logps/rejected": -404.29998779296875, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -1.124609351158142, "rewards/margins": 8.403124809265137, "rewards/rejected": -9.5234375, "step": 5060 }, { "epoch": 2.672641012124407, "grad_norm": 1.0929825436682807, "learning_rate": 3.3197153400105427e-07, "logits/chosen": -0.26362305879592896, "logits/rejected": -0.5739501714706421, "logps/chosen": -411.0, "logps/rejected": -432.5, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.16796875, "rewards/margins": 7.923437595367432, "rewards/rejected": -9.092187881469727, "step": 5070 }, { "epoch": 2.6779124934106484, "grad_norm": 5.34096045304511, "learning_rate": 3.306536636794939e-07, "logits/chosen": -0.4308410584926605, "logits/rejected": -0.5116821527481079, "logps/chosen": -374.1499938964844, "logps/rejected": -449.6000061035156, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.878491222858429, "rewards/margins": 7.907812595367432, "rewards/rejected": -8.782812118530273, "step": 5080 }, { "epoch": 2.68318397469689, "grad_norm": 10.494315941511568, "learning_rate": 3.2933579335793357e-07, "logits/chosen": -0.39799803495407104, "logits/rejected": -0.525714099407196, "logps/chosen": -375.04998779296875, "logps/rejected": -440.45001220703125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.0261108875274658, "rewards/margins": 7.479687690734863, "rewards/rejected": -8.504687309265137, "step": 5090 }, { "epoch": 2.6884554559831315, "grad_norm": 57.387809170098606, "learning_rate": 3.280179230363732e-07, "logits/chosen": -0.18799439072608948, "logits/rejected": -0.5331054925918579, "logps/chosen": -347.75, "logps/rejected": -431.70001220703125, "loss": 0.0448, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.120141625404358, "rewards/margins": 7.892187595367432, "rewards/rejected": -9.0078125, "step": 5100 }, { "epoch": 2.6937269372693726, "grad_norm": 3.312067915115212, "learning_rate": 3.267000527148128e-07, "logits/chosen": -0.37597352266311646, "logits/rejected": -0.567089855670929, "logps/chosen": -391.8999938964844, "logps/rejected": -424.54998779296875, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.0261719226837158, "rewards/margins": 8.254687309265137, "rewards/rejected": -9.2890625, "step": 5110 }, { "epoch": 2.698998418555614, "grad_norm": 3.732736477566125, "learning_rate": 3.253821823932525e-07, "logits/chosen": -0.17094726860523224, "logits/rejected": -0.5224243402481079, "logps/chosen": -355.70001220703125, "logps/rejected": -407.95001220703125, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -1.2802245616912842, "rewards/margins": 7.901562690734863, "rewards/rejected": -9.1796875, "step": 5120 }, { "epoch": 2.7042698998418553, "grad_norm": 2.2378107736569484, "learning_rate": 3.2406431207169216e-07, "logits/chosen": -0.3477722108364105, "logits/rejected": -0.4235778748989105, "logps/chosen": -351.75, "logps/rejected": -420.1000061035156, "loss": 0.0187, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.509423851966858, "rewards/margins": 8.0390625, "rewards/rejected": -9.542187690734863, "step": 5130 }, { "epoch": 2.709541381128097, "grad_norm": 3.1260865220423426, "learning_rate": 3.2274644175013175e-07, "logits/chosen": -0.36982423067092896, "logits/rejected": -0.546191394329071, "logps/chosen": -400.6000061035156, "logps/rejected": -419.1499938964844, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.6993163824081421, "rewards/margins": 7.765625, "rewards/rejected": -8.471875190734863, "step": 5140 }, { "epoch": 2.7148128624143384, "grad_norm": 4.0315451283720085, "learning_rate": 3.2142857142857145e-07, "logits/chosen": -0.33703309297561646, "logits/rejected": -0.5560058355331421, "logps/chosen": -406.0, "logps/rejected": -474.3999938964844, "loss": 0.0114, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.177099585533142, "rewards/margins": 8.326562881469727, "rewards/rejected": -9.498437881469727, "step": 5150 }, { "epoch": 2.72008434370058, "grad_norm": 24.44275766149383, "learning_rate": 3.2011070110701105e-07, "logits/chosen": -0.3585205078125, "logits/rejected": -0.4068603515625, "logps/chosen": -367.8500061035156, "logps/rejected": -428.3999938964844, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.125726342201233, "rewards/margins": 7.396874904632568, "rewards/rejected": -8.5234375, "step": 5160 }, { "epoch": 2.7253558249868215, "grad_norm": 2.660925626015545, "learning_rate": 3.187928307854507e-07, "logits/chosen": -0.14887695014476776, "logits/rejected": -0.3881469666957855, "logps/chosen": -389.1499938964844, "logps/rejected": -414.20001220703125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -1.198583960533142, "rewards/margins": 7.939062595367432, "rewards/rejected": -9.137499809265137, "step": 5170 }, { "epoch": 2.7306273062730626, "grad_norm": 13.253545770189755, "learning_rate": 3.1747496046389034e-07, "logits/chosen": -0.31010740995407104, "logits/rejected": -0.5484558343887329, "logps/chosen": -377.5, "logps/rejected": -452.1000061035156, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -1.102294921875, "rewards/margins": 8.471875190734863, "rewards/rejected": -9.576562881469727, "step": 5180 }, { "epoch": 2.735898787559304, "grad_norm": 0.6104293921488262, "learning_rate": 3.1615709014233e-07, "logits/chosen": -0.17537231743335724, "logits/rejected": -0.4792724549770355, "logps/chosen": -412.04998779296875, "logps/rejected": -434.45001220703125, "loss": 0.0168, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.393652319908142, "rewards/margins": 7.8203125, "rewards/rejected": -9.21875, "step": 5190 }, { "epoch": 2.7411702688455457, "grad_norm": 27.512340432909085, "learning_rate": 3.148392198207696e-07, "logits/chosen": -0.4679199159145355, "logits/rejected": -0.5761352777481079, "logps/chosen": -385.8500061035156, "logps/rejected": -446.1000061035156, "loss": 0.017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.255517601966858, "rewards/margins": 8.175000190734863, "rewards/rejected": -9.431249618530273, "step": 5200 }, { "epoch": 2.746441750131787, "grad_norm": 10.489578059395173, "learning_rate": 3.135213494992093e-07, "logits/chosen": -0.3965087831020355, "logits/rejected": -0.5929199457168579, "logps/chosen": -363.79998779296875, "logps/rejected": -452.20001220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.9438965320587158, "rewards/margins": 8.723437309265137, "rewards/rejected": -10.657812118530273, "step": 5210 }, { "epoch": 2.7517132314180284, "grad_norm": 7.5315294816175316, "learning_rate": 3.122034791776489e-07, "logits/chosen": -0.2725830078125, "logits/rejected": -0.4790405333042145, "logps/chosen": -388.6499938964844, "logps/rejected": -457.0, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -1.340673804283142, "rewards/margins": 8.412500381469727, "rewards/rejected": -9.746874809265137, "step": 5220 }, { "epoch": 2.75698471270427, "grad_norm": 21.515944296722548, "learning_rate": 3.108856088560886e-07, "logits/chosen": -0.4486633241176605, "logits/rejected": -0.4611450135707855, "logps/chosen": -384.79998779296875, "logps/rejected": -452.1000061035156, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.5768005847930908, "rewards/margins": 7.979687690734863, "rewards/rejected": -9.557812690734863, "step": 5230 }, { "epoch": 2.7622561939905115, "grad_norm": 18.23924257622937, "learning_rate": 3.095677385345282e-07, "logits/chosen": -0.4453674256801605, "logits/rejected": -0.556103527545929, "logps/chosen": -382.8500061035156, "logps/rejected": -406.25, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.62939453125, "rewards/margins": 7.878125190734863, "rewards/rejected": -8.510937690734863, "step": 5240 }, { "epoch": 2.767527675276753, "grad_norm": 1.091573947486351, "learning_rate": 3.082498682129678e-07, "logits/chosen": -0.201873779296875, "logits/rejected": -0.43034666776657104, "logps/chosen": -402.95001220703125, "logps/rejected": -467.20001220703125, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8449341058731079, "rewards/margins": 8.006250381469727, "rewards/rejected": -8.854687690734863, "step": 5250 }, { "epoch": 2.772799156562994, "grad_norm": 4.47787051137613, "learning_rate": 3.0693199789140747e-07, "logits/chosen": -0.33757323026657104, "logits/rejected": -0.4362548887729645, "logps/chosen": -334.45001220703125, "logps/rejected": -423.20001220703125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.0251953601837158, "rewards/margins": 7.618750095367432, "rewards/rejected": -8.649999618530273, "step": 5260 }, { "epoch": 2.7780706378492357, "grad_norm": 2.8097490972982935, "learning_rate": 3.056141275698471e-07, "logits/chosen": -0.35212403535842896, "logits/rejected": -0.511547863483429, "logps/chosen": -394.1000061035156, "logps/rejected": -422.75, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -1.222619652748108, "rewards/margins": 7.787499904632568, "rewards/rejected": -9.015625, "step": 5270 }, { "epoch": 2.783342119135477, "grad_norm": 0.8202390669924804, "learning_rate": 3.0429625724828676e-07, "logits/chosen": -0.351553350687027, "logits/rejected": -0.5472778081893921, "logps/chosen": -366.45001220703125, "logps/rejected": -424.8999938964844, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -1.567529320716858, "rewards/margins": 7.860937595367432, "rewards/rejected": -9.428125381469727, "step": 5280 }, { "epoch": 2.7886136004217184, "grad_norm": 3.2694632448182923, "learning_rate": 3.029783869267264e-07, "logits/chosen": -0.28619384765625, "logits/rejected": -0.608813464641571, "logps/chosen": -357.25, "logps/rejected": -454.8500061035156, "loss": 0.0197, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7887694835662842, "rewards/margins": 8.746874809265137, "rewards/rejected": -10.545312881469727, "step": 5290 }, { "epoch": 2.79388508170796, "grad_norm": 143.00564675988326, "learning_rate": 3.0166051660516606e-07, "logits/chosen": -0.432373046875, "logits/rejected": -0.6277168393135071, "logps/chosen": -370.6499938964844, "logps/rejected": -417.8999938964844, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6441528797149658, "rewards/margins": 8.245312690734863, "rewards/rejected": -9.896875381469727, "step": 5300 }, { "epoch": 2.7991565629942015, "grad_norm": 15.77350764673123, "learning_rate": 3.0034264628360565e-07, "logits/chosen": -0.37336426973342896, "logits/rejected": -0.59375, "logps/chosen": -404.3999938964844, "logps/rejected": -424.0, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.3122069835662842, "rewards/margins": 8.092187881469727, "rewards/rejected": -9.412500381469727, "step": 5310 }, { "epoch": 2.804428044280443, "grad_norm": 29.821467433406944, "learning_rate": 2.9902477596204535e-07, "logits/chosen": -0.17919921875, "logits/rejected": -0.45219725370407104, "logps/chosen": -389.25, "logps/rejected": -467.3999938964844, "loss": 0.0209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.391455054283142, "rewards/margins": 8.403124809265137, "rewards/rejected": -9.793749809265137, "step": 5320 }, { "epoch": 2.809699525566684, "grad_norm": 0.8462849694178508, "learning_rate": 2.9770690564048495e-07, "logits/chosen": -0.4839843809604645, "logits/rejected": -0.5108398199081421, "logps/chosen": -350.54998779296875, "logps/rejected": -431.54998779296875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.5403320789337158, "rewards/margins": 7.915625095367432, "rewards/rejected": -9.459375381469727, "step": 5330 }, { "epoch": 2.8149710068529257, "grad_norm": 30.4509500702529, "learning_rate": 2.963890353189246e-07, "logits/chosen": -0.44172364473342896, "logits/rejected": -0.49267578125, "logps/chosen": -371.54998779296875, "logps/rejected": -414.25, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2682616710662842, "rewards/margins": 7.1796875, "rewards/rejected": -8.443750381469727, "step": 5340 }, { "epoch": 2.8202424881391672, "grad_norm": 7.975848138472269, "learning_rate": 2.9507116499736424e-07, "logits/chosen": -0.17926025390625, "logits/rejected": -0.3725219666957855, "logps/chosen": -439.20001220703125, "logps/rejected": -439.6000061035156, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.81640625, "rewards/margins": 8.153124809265137, "rewards/rejected": -8.965624809265137, "step": 5350 }, { "epoch": 2.8255139694254083, "grad_norm": 6.250476271729425, "learning_rate": 2.937532946758039e-07, "logits/chosen": -0.30921632051467896, "logits/rejected": -0.5372680425643921, "logps/chosen": -387.54998779296875, "logps/rejected": -399.3500061035156, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -0.999804675579071, "rewards/margins": 7.535937309265137, "rewards/rejected": -8.528124809265137, "step": 5360 }, { "epoch": 2.83078545071165, "grad_norm": 5.736633084210226, "learning_rate": 2.924354243542435e-07, "logits/chosen": -0.13525390625, "logits/rejected": -0.3771423399448395, "logps/chosen": -416.1000061035156, "logps/rejected": -447.45001220703125, "loss": 0.0361, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6168457269668579, "rewards/margins": 7.699999809265137, "rewards/rejected": -8.3203125, "step": 5370 }, { "epoch": 2.8360569319978914, "grad_norm": 2.181122389634545, "learning_rate": 2.911175540326832e-07, "logits/chosen": -0.49454957246780396, "logits/rejected": -0.64208984375, "logps/chosen": -357.75, "logps/rejected": -401.6000061035156, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.907275378704071, "rewards/margins": 8.168749809265137, "rewards/rejected": -9.067187309265137, "step": 5380 }, { "epoch": 2.841328413284133, "grad_norm": 3.58718519366506, "learning_rate": 2.897996837111228e-07, "logits/chosen": -0.3484863340854645, "logits/rejected": -0.5274413824081421, "logps/chosen": -396.04998779296875, "logps/rejected": -406.20001220703125, "loss": 0.0156, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8626464605331421, "rewards/margins": 7.342187404632568, "rewards/rejected": -8.212499618530273, "step": 5390 }, { "epoch": 2.8465998945703745, "grad_norm": 3.35863737244598, "learning_rate": 2.884818133895625e-07, "logits/chosen": -0.3863159120082855, "logits/rejected": -0.579541027545929, "logps/chosen": -379.0, "logps/rejected": -443.95001220703125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.2058594226837158, "rewards/margins": 7.598437309265137, "rewards/rejected": -8.801562309265137, "step": 5400 }, { "epoch": 2.8518713758566157, "grad_norm": 1.965571190354165, "learning_rate": 2.871639430680021e-07, "logits/chosen": -0.39082640409469604, "logits/rejected": -0.69329833984375, "logps/chosen": -418.70001220703125, "logps/rejected": -429.79998779296875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.024316430091858, "rewards/margins": 8.206250190734863, "rewards/rejected": -9.237500190734863, "step": 5410 }, { "epoch": 2.857142857142857, "grad_norm": 12.330341086725376, "learning_rate": 2.858460727464417e-07, "logits/chosen": -0.31589967012405396, "logits/rejected": -0.4862609803676605, "logps/chosen": -351.6499938964844, "logps/rejected": -417.20001220703125, "loss": 0.0143, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.235937476158142, "rewards/margins": 8.135937690734863, "rewards/rejected": -9.370312690734863, "step": 5420 }, { "epoch": 2.8624143384290983, "grad_norm": 2.9029868807434562, "learning_rate": 2.845282024248814e-07, "logits/chosen": -0.2755126953125, "logits/rejected": -0.7448791265487671, "logps/chosen": -381.5, "logps/rejected": -443.6000061035156, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -1.294946312904358, "rewards/margins": 8.846875190734863, "rewards/rejected": -10.146875381469727, "step": 5430 }, { "epoch": 2.86768581971534, "grad_norm": 2.2318828064237612, "learning_rate": 2.83210332103321e-07, "logits/chosen": -0.15069580078125, "logits/rejected": -0.5205841064453125, "logps/chosen": -380.5, "logps/rejected": -422.25, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.572167992591858, "rewards/margins": 8.240625381469727, "rewards/rejected": -9.806249618530273, "step": 5440 }, { "epoch": 2.8729573010015814, "grad_norm": 2.650358200035014, "learning_rate": 2.8189246178176067e-07, "logits/chosen": -0.318765252828598, "logits/rejected": -0.588549792766571, "logps/chosen": -406.1000061035156, "logps/rejected": -440.5, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -1.274438500404358, "rewards/margins": 8.25, "rewards/rejected": -9.518750190734863, "step": 5450 }, { "epoch": 2.878228782287823, "grad_norm": 1.8785935971937713, "learning_rate": 2.805745914602003e-07, "logits/chosen": -0.24214477837085724, "logits/rejected": -0.43543702363967896, "logps/chosen": -391.25, "logps/rejected": -397.3999938964844, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -0.734619140625, "rewards/margins": 7.678124904632568, "rewards/rejected": -8.410937309265137, "step": 5460 }, { "epoch": 2.8835002635740645, "grad_norm": 5.802932940017478, "learning_rate": 2.7925672113863996e-07, "logits/chosen": -0.2650390565395355, "logits/rejected": -0.6595703363418579, "logps/chosen": -355.70001220703125, "logps/rejected": -400.45001220703125, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7864990234375, "rewards/margins": 7.378125190734863, "rewards/rejected": -8.1640625, "step": 5470 }, { "epoch": 2.8887717448603056, "grad_norm": 5.660010713310649, "learning_rate": 2.7793885081707956e-07, "logits/chosen": -0.37507933378219604, "logits/rejected": -0.50360107421875, "logps/chosen": -341.8500061035156, "logps/rejected": -426.29998779296875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -1.048486351966858, "rewards/margins": 7.9375, "rewards/rejected": -8.985937118530273, "step": 5480 }, { "epoch": 2.894043226146547, "grad_norm": 2.4380711728667896, "learning_rate": 2.7662098049551926e-07, "logits/chosen": -0.16023559868335724, "logits/rejected": -0.41168212890625, "logps/chosen": -446.29998779296875, "logps/rejected": -465.1000061035156, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.186364769935608, "rewards/margins": 7.940625190734863, "rewards/rejected": -9.126562118530273, "step": 5490 }, { "epoch": 2.8993147074327887, "grad_norm": 11.984971563553826, "learning_rate": 2.7530311017395885e-07, "logits/chosen": -0.34705811738967896, "logits/rejected": -0.4565673768520355, "logps/chosen": -376.79998779296875, "logps/rejected": -437.5, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -0.6198364496231079, "rewards/margins": 7.715624809265137, "rewards/rejected": -8.337499618530273, "step": 5500 }, { "epoch": 2.90458618871903, "grad_norm": 0.8354153835453048, "learning_rate": 2.739852398523985e-07, "logits/chosen": -0.3968749940395355, "logits/rejected": -0.587049126625061, "logps/chosen": -431.79998779296875, "logps/rejected": -469.5, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.0798218250274658, "rewards/margins": 8.096875190734863, "rewards/rejected": -9.1796875, "step": 5510 }, { "epoch": 2.9098576700052714, "grad_norm": 6.167011620145386, "learning_rate": 2.7266736953083815e-07, "logits/chosen": -0.232177734375, "logits/rejected": -0.512011706829071, "logps/chosen": -391.8999938964844, "logps/rejected": -459.8999938964844, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -1.039770483970642, "rewards/margins": 7.764062404632568, "rewards/rejected": -8.793749809265137, "step": 5520 }, { "epoch": 2.915129151291513, "grad_norm": 80.87586319837962, "learning_rate": 2.713494992092778e-07, "logits/chosen": -0.39069825410842896, "logits/rejected": -0.615234375, "logps/chosen": -392.3999938964844, "logps/rejected": -400.6000061035156, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8154296875, "rewards/margins": 7.731249809265137, "rewards/rejected": -9.546875, "step": 5530 }, { "epoch": 2.9204006325777545, "grad_norm": 1.4451062570025872, "learning_rate": 2.700316288877174e-07, "logits/chosen": -0.3654846251010895, "logits/rejected": -0.58905029296875, "logps/chosen": -370.20001220703125, "logps/rejected": -413.1499938964844, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.501953125, "rewards/margins": 8.089062690734863, "rewards/rejected": -9.590624809265137, "step": 5540 }, { "epoch": 2.925672113863996, "grad_norm": 7.730066664419544, "learning_rate": 2.687137585661571e-07, "logits/chosen": -0.3307128846645355, "logits/rejected": -0.571667492389679, "logps/chosen": -388.95001220703125, "logps/rejected": -449.3999938964844, "loss": 0.0186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0357666015625, "rewards/margins": 7.887499809265137, "rewards/rejected": -8.923437118530273, "step": 5550 }, { "epoch": 2.930943595150237, "grad_norm": 5.61027637367317, "learning_rate": 2.6739588824459674e-07, "logits/chosen": -0.20627442002296448, "logits/rejected": -0.4129272401332855, "logps/chosen": -437.70001220703125, "logps/rejected": -468.6000061035156, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.880908191204071, "rewards/margins": 8.181249618530273, "rewards/rejected": -9.0625, "step": 5560 }, { "epoch": 2.9362150764364787, "grad_norm": 1.4924859901347787, "learning_rate": 2.660780179230364e-07, "logits/chosen": -0.3503662049770355, "logits/rejected": -0.5128418207168579, "logps/chosen": -421.6499938964844, "logps/rejected": -442.45001220703125, "loss": 0.019, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.168066382408142, "rewards/margins": 8.076562881469727, "rewards/rejected": -9.239062309265137, "step": 5570 }, { "epoch": 2.94148655772272, "grad_norm": 14.624159877031316, "learning_rate": 2.6476014760147603e-07, "logits/chosen": -0.18511962890625, "logits/rejected": -0.538134753704071, "logps/chosen": -419.3999938964844, "logps/rejected": -418.70001220703125, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1914551258087158, "rewards/margins": 7.8125, "rewards/rejected": -8.998437881469727, "step": 5580 }, { "epoch": 2.9467580390089614, "grad_norm": 9.562056125530658, "learning_rate": 2.634422772799156e-07, "logits/chosen": -0.4042724668979645, "logits/rejected": -0.616503894329071, "logps/chosen": -393.0, "logps/rejected": -433.79998779296875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -1.2961914539337158, "rewards/margins": 7.65625, "rewards/rejected": -8.959375381469727, "step": 5590 }, { "epoch": 2.952029520295203, "grad_norm": 17.41816645445159, "learning_rate": 2.6212440695835533e-07, "logits/chosen": -0.22416992485523224, "logits/rejected": -0.522412121295929, "logps/chosen": -399.6499938964844, "logps/rejected": -432.79998779296875, "loss": 0.0248, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.108667016029358, "rewards/margins": 8.081250190734863, "rewards/rejected": -9.198437690734863, "step": 5600 }, { "epoch": 2.9573010015814445, "grad_norm": 0.46035094806031784, "learning_rate": 2.608065366367949e-07, "logits/chosen": -0.3370361328125, "logits/rejected": -0.48286134004592896, "logps/chosen": -400.0, "logps/rejected": -469.20001220703125, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.227197289466858, "rewards/margins": 8.475000381469727, "rewards/rejected": -9.704687118530273, "step": 5610 }, { "epoch": 2.962572482867686, "grad_norm": 3.685237361253025, "learning_rate": 2.5948866631523457e-07, "logits/chosen": -0.2575927674770355, "logits/rejected": -0.45673829317092896, "logps/chosen": -370.95001220703125, "logps/rejected": -431.04998779296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.2584960460662842, "rewards/margins": 8.009374618530273, "rewards/rejected": -9.260937690734863, "step": 5620 }, { "epoch": 2.967843964153927, "grad_norm": 13.515783537666813, "learning_rate": 2.581707959936742e-07, "logits/chosen": -0.38264161348342896, "logits/rejected": -0.5221649408340454, "logps/chosen": -382.04998779296875, "logps/rejected": -444.5, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.9145019054412842, "rewards/margins": 8.356249809265137, "rewards/rejected": -10.274999618530273, "step": 5630 }, { "epoch": 2.9731154454401687, "grad_norm": 0.9200824159633252, "learning_rate": 2.5685292567211386e-07, "logits/chosen": -0.34223634004592896, "logits/rejected": -0.5779174566268921, "logps/chosen": -379.54998779296875, "logps/rejected": -407.1499938964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.44775390625, "rewards/margins": 8.859375, "rewards/rejected": -10.309374809265137, "step": 5640 }, { "epoch": 2.9783869267264103, "grad_norm": 17.80290116858315, "learning_rate": 2.5553505535055346e-07, "logits/chosen": -0.23953858017921448, "logits/rejected": -0.644482433795929, "logps/chosen": -382.0, "logps/rejected": -429.8999938964844, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.4100220203399658, "rewards/margins": 7.946875095367432, "rewards/rejected": -9.357812881469727, "step": 5650 }, { "epoch": 2.9836584080126514, "grad_norm": 91.0996067658643, "learning_rate": 2.5421718502899316e-07, "logits/chosen": -0.25513917207717896, "logits/rejected": -0.4976440370082855, "logps/chosen": -387.3999938964844, "logps/rejected": -438.1000061035156, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.58740234375, "rewards/margins": 8.2421875, "rewards/rejected": -9.834375381469727, "step": 5660 }, { "epoch": 2.988929889298893, "grad_norm": 13.485013162733248, "learning_rate": 2.5289931470743275e-07, "logits/chosen": -0.315338134765625, "logits/rejected": -0.4541015625, "logps/chosen": -365.0, "logps/rejected": -457.3999938964844, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8073241710662842, "rewards/margins": 8.079687118530273, "rewards/rejected": -9.890625, "step": 5670 }, { "epoch": 2.9942013705851345, "grad_norm": 27.693137997203152, "learning_rate": 2.515814443858724e-07, "logits/chosen": -0.22316893935203552, "logits/rejected": -0.712261974811554, "logps/chosen": -379.79998779296875, "logps/rejected": -421.20001220703125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.9787108898162842, "rewards/margins": 8.407812118530273, "rewards/rejected": -10.3828125, "step": 5680 }, { "epoch": 2.999472851871376, "grad_norm": 0.8949321866783594, "learning_rate": 2.5026357406431205e-07, "logits/chosen": -0.24069824814796448, "logits/rejected": -0.599658191204071, "logps/chosen": -353.0, "logps/rejected": -415.8999938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.745263695716858, "rewards/margins": 8.225000381469727, "rewards/rejected": -9.967187881469727, "step": 5690 }, { "epoch": 3.004744333157617, "grad_norm": 0.4088110528146211, "learning_rate": 2.489457037427517e-07, "logits/chosen": -0.35895997285842896, "logits/rejected": -0.552490234375, "logps/chosen": -388.54998779296875, "logps/rejected": -455.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.36651611328125, "rewards/margins": 9.487500190734863, "rewards/rejected": -10.850000381469727, "step": 5700 }, { "epoch": 3.0100158144438587, "grad_norm": 3.1599879720466126, "learning_rate": 2.4762783342119134e-07, "logits/chosen": -0.38330078125, "logits/rejected": -0.539501965045929, "logps/chosen": -405.70001220703125, "logps/rejected": -467.6000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.934033215045929, "rewards/margins": 9.689062118530273, "rewards/rejected": -10.621874809265137, "step": 5710 }, { "epoch": 3.0152872957301002, "grad_norm": 0.510611374962784, "learning_rate": 2.46309963099631e-07, "logits/chosen": -0.21137695014476776, "logits/rejected": -0.5085045099258423, "logps/chosen": -392.75, "logps/rejected": -455.75, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.912646472454071, "rewards/margins": 9.684374809265137, "rewards/rejected": -10.603124618530273, "step": 5720 }, { "epoch": 3.020558777016342, "grad_norm": 4.270841373395058, "learning_rate": 2.4499209277807064e-07, "logits/chosen": -0.3039306700229645, "logits/rejected": -0.619921863079071, "logps/chosen": -376.1499938964844, "logps/rejected": -425.25, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.3083007335662842, "rewards/margins": 9.159375190734863, "rewards/rejected": -10.467187881469727, "step": 5730 }, { "epoch": 3.025830258302583, "grad_norm": 1.1787674495109797, "learning_rate": 2.4367422245651023e-07, "logits/chosen": -0.36778563261032104, "logits/rejected": -0.590649425983429, "logps/chosen": -371.95001220703125, "logps/rejected": -407.6499938964844, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.0239379405975342, "rewards/margins": 8.767187118530273, "rewards/rejected": -9.790624618530273, "step": 5740 }, { "epoch": 3.0311017395888245, "grad_norm": 0.5132603786286064, "learning_rate": 2.423563521349499e-07, "logits/chosen": -0.2918334901332855, "logits/rejected": -0.6058593988418579, "logps/chosen": -417.25, "logps/rejected": -461.6000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.9278320074081421, "rewards/margins": 9.306249618530273, "rewards/rejected": -10.234375, "step": 5750 }, { "epoch": 3.036373220875066, "grad_norm": 0.24705872982989527, "learning_rate": 2.4103848181338953e-07, "logits/chosen": -0.303536981344223, "logits/rejected": -0.6233886480331421, "logps/chosen": -395.8999938964844, "logps/rejected": -420.1000061035156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -1.3043701648712158, "rewards/margins": 9.112500190734863, "rewards/rejected": -10.407812118530273, "step": 5760 }, { "epoch": 3.041644702161307, "grad_norm": 1.345610155600766, "learning_rate": 2.3972061149182923e-07, "logits/chosen": -0.2973388731479645, "logits/rejected": -0.48707276582717896, "logps/chosen": -412.6000061035156, "logps/rejected": -465.8500061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.398290991783142, "rewards/margins": 9.520312309265137, "rewards/rejected": -10.915624618530273, "step": 5770 }, { "epoch": 3.0469161834475487, "grad_norm": 4.262821656090499, "learning_rate": 2.3840274117026885e-07, "logits/chosen": -0.34447020292282104, "logits/rejected": -0.7544921636581421, "logps/chosen": -385.0, "logps/rejected": -457.79998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.1747069358825684, "rewards/margins": 9.401562690734863, "rewards/rejected": -11.5625, "step": 5780 }, { "epoch": 3.05218766473379, "grad_norm": 2.6306480255398172, "learning_rate": 2.370848708487085e-07, "logits/chosen": -0.2988037168979645, "logits/rejected": -0.56549072265625, "logps/chosen": -378.54998779296875, "logps/rejected": -456.29998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.3023924827575684, "rewards/margins": 9.134374618530273, "rewards/rejected": -11.440625190734863, "step": 5790 }, { "epoch": 3.0574591460200318, "grad_norm": 11.40798445792444, "learning_rate": 2.3576700052714812e-07, "logits/chosen": -0.08225097507238388, "logits/rejected": -0.5296570062637329, "logps/chosen": -394.8999938964844, "logps/rejected": -452.79998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.667578101158142, "rewards/margins": 9.514062881469727, "rewards/rejected": -11.190625190734863, "step": 5800 }, { "epoch": 3.062730627306273, "grad_norm": 0.5278653515280484, "learning_rate": 2.3444913020558777e-07, "logits/chosen": -0.26670533418655396, "logits/rejected": -0.613861083984375, "logps/chosen": -382.04998779296875, "logps/rejected": -490.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6417968273162842, "rewards/margins": 10.546875, "rewards/rejected": -12.184374809265137, "step": 5810 }, { "epoch": 3.0680021085925144, "grad_norm": 0.44892799236387976, "learning_rate": 2.3313125988402741e-07, "logits/chosen": -0.4080566465854645, "logits/rejected": -0.673754870891571, "logps/chosen": -404.04998779296875, "logps/rejected": -472.6000061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.4691650867462158, "rewards/margins": 9.167187690734863, "rewards/rejected": -10.637499809265137, "step": 5820 }, { "epoch": 3.073273589878756, "grad_norm": 0.4627388775184645, "learning_rate": 2.3181338956246703e-07, "logits/chosen": -0.46772462129592896, "logits/rejected": -0.77099609375, "logps/chosen": -411.54998779296875, "logps/rejected": -469.70001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.7908203601837158, "rewards/margins": 9.640625, "rewards/rejected": -11.4375, "step": 5830 }, { "epoch": 3.0785450711649975, "grad_norm": 3.207272647677549, "learning_rate": 2.3049551924090668e-07, "logits/chosen": -0.4591308534145355, "logits/rejected": -0.6483215093612671, "logps/chosen": -378.29998779296875, "logps/rejected": -445.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.728515625, "rewards/margins": 9.729687690734863, "rewards/rejected": -11.456250190734863, "step": 5840 }, { "epoch": 3.0838165524512386, "grad_norm": 0.26370266476489507, "learning_rate": 2.2917764891934633e-07, "logits/chosen": -0.26313477754592896, "logits/rejected": -0.6598449945449829, "logps/chosen": -432.3999938964844, "logps/rejected": -464.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.536035180091858, "rewards/margins": 9.603124618530273, "rewards/rejected": -11.143750190734863, "step": 5850 }, { "epoch": 3.08908803373748, "grad_norm": 0.4039145153598262, "learning_rate": 2.2785977859778595e-07, "logits/chosen": -0.25788575410842896, "logits/rejected": -0.4913085997104645, "logps/chosen": -382.6000061035156, "logps/rejected": -486.8999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.6090819835662842, "rewards/margins": 9.46875, "rewards/rejected": -11.065625190734863, "step": 5860 }, { "epoch": 3.0943595150237218, "grad_norm": 0.47366428053594223, "learning_rate": 2.265419082762256e-07, "logits/chosen": -0.45136719942092896, "logits/rejected": -0.631152331829071, "logps/chosen": -391.3500061035156, "logps/rejected": -437.1000061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.4664063453674316, "rewards/margins": 9.059374809265137, "rewards/rejected": -11.524999618530273, "step": 5870 }, { "epoch": 3.0996309963099633, "grad_norm": 0.5501550682881512, "learning_rate": 2.2522403795466525e-07, "logits/chosen": -0.523242175579071, "logits/rejected": -0.6559814214706421, "logps/chosen": -390.6000061035156, "logps/rejected": -450.3999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.236669898033142, "rewards/margins": 9.725000381469727, "rewards/rejected": -10.967187881469727, "step": 5880 }, { "epoch": 3.1049024775962044, "grad_norm": 0.4016498125335753, "learning_rate": 2.239061676331049e-07, "logits/chosen": -0.46422117948532104, "logits/rejected": -0.6295166015625, "logps/chosen": -406.0, "logps/rejected": -466.6000061035156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.9070312976837158, "rewards/margins": 9.442187309265137, "rewards/rejected": -11.346875190734863, "step": 5890 }, { "epoch": 3.110173958882446, "grad_norm": 0.6977570865593005, "learning_rate": 2.2258829731154451e-07, "logits/chosen": -0.386962890625, "logits/rejected": -0.704394519329071, "logps/chosen": -354.0, "logps/rejected": -419.20001220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -1.667626976966858, "rewards/margins": 9.5859375, "rewards/rejected": -11.253125190734863, "step": 5900 }, { "epoch": 3.1154454401686875, "grad_norm": 0.41764840239835443, "learning_rate": 2.212704269899842e-07, "logits/chosen": -0.24842528998851776, "logits/rejected": -0.707324206829071, "logps/chosen": -407.8500061035156, "logps/rejected": -452.3999938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.7748534679412842, "rewards/margins": 9.018750190734863, "rewards/rejected": -10.7890625, "step": 5910 }, { "epoch": 3.1207169214549286, "grad_norm": 0.3378516034395203, "learning_rate": 2.1995255666842384e-07, "logits/chosen": -0.286590576171875, "logits/rejected": -0.626782238483429, "logps/chosen": -443.1000061035156, "logps/rejected": -483.79998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.850000023841858, "rewards/margins": 9.301562309265137, "rewards/rejected": -11.146875381469727, "step": 5920 }, { "epoch": 3.12598840274117, "grad_norm": 0.82195655524876, "learning_rate": 2.1863468634686346e-07, "logits/chosen": -0.313070684671402, "logits/rejected": -0.542651355266571, "logps/chosen": -369.6000061035156, "logps/rejected": -444.70001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.7986328601837158, "rewards/margins": 9.4140625, "rewards/rejected": -11.215624809265137, "step": 5930 }, { "epoch": 3.1312598840274117, "grad_norm": 3.8635525887689135, "learning_rate": 2.173168160253031e-07, "logits/chosen": -0.34906309843063354, "logits/rejected": -0.573071300983429, "logps/chosen": -371.04998779296875, "logps/rejected": -433.5, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.6618163585662842, "rewards/margins": 9.637499809265137, "rewards/rejected": -11.295312881469727, "step": 5940 }, { "epoch": 3.1365313653136533, "grad_norm": 0.5900894872478385, "learning_rate": 2.1599894570374275e-07, "logits/chosen": -0.30487060546875, "logits/rejected": -0.5698486566543579, "logps/chosen": -432.75, "logps/rejected": -476.0, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.4840819835662842, "rewards/margins": 9.470312118530273, "rewards/rejected": -10.956250190734863, "step": 5950 }, { "epoch": 3.1418028465998944, "grad_norm": 3.0825450159500787, "learning_rate": 2.1468107538218237e-07, "logits/chosen": -0.3228820860385895, "logits/rejected": -0.6187957525253296, "logps/chosen": -412.75, "logps/rejected": -475.70001220703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.8193359375, "rewards/margins": 9.34375, "rewards/rejected": -11.175000190734863, "step": 5960 }, { "epoch": 3.147074327886136, "grad_norm": 4.3181609625684505, "learning_rate": 2.1336320506062202e-07, "logits/chosen": -0.46528321504592896, "logits/rejected": -0.7803710699081421, "logps/chosen": -396.45001220703125, "logps/rejected": -434.20001220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.632421851158142, "rewards/margins": 9.784375190734863, "rewards/rejected": -11.415624618530273, "step": 5970 }, { "epoch": 3.1523458091723775, "grad_norm": 1.0555348285793624, "learning_rate": 2.1204533473906167e-07, "logits/chosen": -0.3575195372104645, "logits/rejected": -0.7641662359237671, "logps/chosen": -358.29998779296875, "logps/rejected": -453.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.1875, "rewards/margins": 10.199999809265137, "rewards/rejected": -12.390625, "step": 5980 }, { "epoch": 3.157617290458619, "grad_norm": 0.6154014137913136, "learning_rate": 2.1072746441750132e-07, "logits/chosen": -0.19678345322608948, "logits/rejected": -0.648144543170929, "logps/chosen": -419.1499938964844, "logps/rejected": -475.79998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5695312023162842, "rewards/margins": 9.939062118530273, "rewards/rejected": -11.506250381469727, "step": 5990 }, { "epoch": 3.16288877174486, "grad_norm": 1.4168962415907116, "learning_rate": 2.0940959409594094e-07, "logits/chosen": -0.3916259706020355, "logits/rejected": -0.666186511516571, "logps/chosen": -354.8500061035156, "logps/rejected": -445.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.7403380870819092, "rewards/margins": 9.978124618530273, "rewards/rejected": -11.715624809265137, "step": 6000 }, { "epoch": 3.1681602530311017, "grad_norm": 0.4557113930884351, "learning_rate": 2.0809172377438058e-07, "logits/chosen": -0.543872058391571, "logits/rejected": -0.657397449016571, "logps/chosen": -385.6000061035156, "logps/rejected": -454.5, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.702880859375, "rewards/margins": 9.71875, "rewards/rejected": -11.425000190734863, "step": 6010 }, { "epoch": 3.1734317343173433, "grad_norm": 0.9313671944355586, "learning_rate": 2.0677385345282023e-07, "logits/chosen": -0.4543212950229645, "logits/rejected": -0.6709960699081421, "logps/chosen": -409.1499938964844, "logps/rejected": -482.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0045409202575684, "rewards/margins": 9.417187690734863, "rewards/rejected": -11.418749809265137, "step": 6020 }, { "epoch": 3.1787032156035844, "grad_norm": 1.1117208371072003, "learning_rate": 2.0545598313125985e-07, "logits/chosen": -0.6442626714706421, "logits/rejected": -0.6862426996231079, "logps/chosen": -419.3999938964844, "logps/rejected": -498.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.250195264816284, "rewards/margins": 9.496874809265137, "rewards/rejected": -11.734375, "step": 6030 }, { "epoch": 3.183974696889826, "grad_norm": 0.5903606201360145, "learning_rate": 2.041381128096995e-07, "logits/chosen": -0.3529296815395355, "logits/rejected": -0.692822277545929, "logps/chosen": -389.70001220703125, "logps/rejected": -484.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.1631836891174316, "rewards/margins": 10.493749618530273, "rewards/rejected": -12.65625, "step": 6040 }, { "epoch": 3.1892461781760675, "grad_norm": 2.494607107915243, "learning_rate": 2.0282024248813917e-07, "logits/chosen": -0.45280760526657104, "logits/rejected": -0.5946289300918579, "logps/chosen": -371.0, "logps/rejected": -487.1000061035156, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.730200171470642, "rewards/margins": 9.75, "rewards/rejected": -11.481249809265137, "step": 6050 }, { "epoch": 3.194517659462309, "grad_norm": 0.7603919943213928, "learning_rate": 2.0150237216657882e-07, "logits/chosen": -0.47911375761032104, "logits/rejected": -0.6798340082168579, "logps/chosen": -388.3500061035156, "logps/rejected": -420.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.79736328125, "rewards/margins": 9.732812881469727, "rewards/rejected": -11.534375190734863, "step": 6060 }, { "epoch": 3.19978914074855, "grad_norm": 1.7234392590235703, "learning_rate": 2.0018450184501844e-07, "logits/chosen": -0.570751965045929, "logits/rejected": -0.7232666015625, "logps/chosen": -370.29998779296875, "logps/rejected": -427.5, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.8196289539337158, "rewards/margins": 9.1328125, "rewards/rejected": -10.962499618530273, "step": 6070 }, { "epoch": 3.2050606220347917, "grad_norm": 2.222528730383596, "learning_rate": 1.988666315234581e-07, "logits/chosen": -0.5366576910018921, "logits/rejected": -0.796459972858429, "logps/chosen": -416.6000061035156, "logps/rejected": -443.8999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.234570264816284, "rewards/margins": 9.262499809265137, "rewards/rejected": -11.496874809265137, "step": 6080 }, { "epoch": 3.2103321033210332, "grad_norm": 0.15447319032880652, "learning_rate": 1.9754876120189774e-07, "logits/chosen": -0.581347644329071, "logits/rejected": -0.71875, "logps/chosen": -412.54998779296875, "logps/rejected": -459.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9453125, "rewards/margins": 9.865625381469727, "rewards/rejected": -11.815625190734863, "step": 6090 }, { "epoch": 3.215603584607275, "grad_norm": 1.065231858638598, "learning_rate": 1.9623089088033736e-07, "logits/chosen": -0.23073729872703552, "logits/rejected": -0.6479247808456421, "logps/chosen": -402.3999938964844, "logps/rejected": -448.70001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.4689452648162842, "rewards/margins": 9.609375, "rewards/rejected": -11.084375381469727, "step": 6100 }, { "epoch": 3.220875065893516, "grad_norm": 4.254073755425588, "learning_rate": 1.94913020558777e-07, "logits/chosen": -0.35651856660842896, "logits/rejected": -0.74365234375, "logps/chosen": -387.0, "logps/rejected": -468.6000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.9482421875, "rewards/margins": 9.975000381469727, "rewards/rejected": -11.921875, "step": 6110 }, { "epoch": 3.2261465471797575, "grad_norm": 2.5301599711709764, "learning_rate": 1.9359515023721665e-07, "logits/chosen": -0.5010986328125, "logits/rejected": -0.62890625, "logps/chosen": -387.6499938964844, "logps/rejected": -433.8999938964844, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.7301146984100342, "rewards/margins": 9.493749618530273, "rewards/rejected": -11.221875190734863, "step": 6120 }, { "epoch": 3.231418028465999, "grad_norm": 0.13193240442986645, "learning_rate": 1.9227727991565628e-07, "logits/chosen": -0.3388915956020355, "logits/rejected": -0.6922851800918579, "logps/chosen": -427.29998779296875, "logps/rejected": -495.29998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.8859374523162842, "rewards/margins": 9.959375381469727, "rewards/rejected": -11.837499618530273, "step": 6130 }, { "epoch": 3.2366895097522406, "grad_norm": 1.1536380179297319, "learning_rate": 1.9095940959409592e-07, "logits/chosen": -0.45782470703125, "logits/rejected": -0.736279308795929, "logps/chosen": -384.3999938964844, "logps/rejected": -438.79998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.2194457054138184, "rewards/margins": 9.6953125, "rewards/rejected": -11.90625, "step": 6140 }, { "epoch": 3.2419609910384817, "grad_norm": 1.5971985361997407, "learning_rate": 1.8964153927253557e-07, "logits/chosen": -0.4181884825229645, "logits/rejected": -0.558911144733429, "logps/chosen": -371.25, "logps/rejected": -487.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.4320311546325684, "rewards/margins": 9.925000190734863, "rewards/rejected": -12.34375, "step": 6150 }, { "epoch": 3.2472324723247232, "grad_norm": 0.8037019347133407, "learning_rate": 1.8832366895097522e-07, "logits/chosen": -0.517199695110321, "logits/rejected": -0.656494140625, "logps/chosen": -386.04998779296875, "logps/rejected": -466.3999938964844, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.2857298851013184, "rewards/margins": 9.856249809265137, "rewards/rejected": -12.137499809265137, "step": 6160 }, { "epoch": 3.252503953610965, "grad_norm": 0.1317473158598181, "learning_rate": 1.8700579862941484e-07, "logits/chosen": -0.573046863079071, "logits/rejected": -0.655444324016571, "logps/chosen": -360.8500061035156, "logps/rejected": -438.5, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.134570360183716, "rewards/margins": 9.543749809265137, "rewards/rejected": -11.668749809265137, "step": 6170 }, { "epoch": 3.257775434897206, "grad_norm": 0.8916554007996711, "learning_rate": 1.856879283078545e-07, "logits/chosen": -0.43525391817092896, "logits/rejected": -0.7264159917831421, "logps/chosen": -388.75, "logps/rejected": -465.79998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.8336913585662842, "rewards/margins": 9.842187881469727, "rewards/rejected": -11.671875, "step": 6180 }, { "epoch": 3.2630469161834474, "grad_norm": 1.014307269595932, "learning_rate": 1.8437005798629416e-07, "logits/chosen": -0.549121081829071, "logits/rejected": -0.60595703125, "logps/chosen": -411.8500061035156, "logps/rejected": -503.3999938964844, "loss": 0.0057, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.622851610183716, "rewards/margins": 10.126562118530273, "rewards/rejected": -12.753125190734863, "step": 6190 }, { "epoch": 3.268318397469689, "grad_norm": 0.4154855091108102, "learning_rate": 1.8305218766473378e-07, "logits/chosen": -0.42966920137405396, "logits/rejected": -0.688671886920929, "logps/chosen": -419.6000061035156, "logps/rejected": -469.25, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.330273389816284, "rewards/margins": 9.731249809265137, "rewards/rejected": -12.059374809265137, "step": 6200 }, { "epoch": 3.2735898787559305, "grad_norm": 0.47700651650105247, "learning_rate": 1.8173431734317343e-07, "logits/chosen": -0.5870116949081421, "logits/rejected": -0.686779797077179, "logps/chosen": -358.20001220703125, "logps/rejected": -448.6000061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.3623046875, "rewards/margins": 9.643750190734863, "rewards/rejected": -11.993749618530273, "step": 6210 }, { "epoch": 3.2788613600421717, "grad_norm": 0.21999448205877592, "learning_rate": 1.8041644702161308e-07, "logits/chosen": -0.2741455137729645, "logits/rejected": -0.696484386920929, "logps/chosen": -377.95001220703125, "logps/rejected": -440.3999938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.303417921066284, "rewards/margins": 9.893750190734863, "rewards/rejected": -12.203125, "step": 6220 }, { "epoch": 3.284132841328413, "grad_norm": 0.4186764352826503, "learning_rate": 1.790985767000527e-07, "logits/chosen": -0.38564223051071167, "logits/rejected": -0.609619140625, "logps/chosen": -397.3999938964844, "logps/rejected": -449.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0423827171325684, "rewards/margins": 10.162500381469727, "rewards/rejected": -12.203125, "step": 6230 }, { "epoch": 3.2894043226146548, "grad_norm": 1.6723929239817805, "learning_rate": 1.7778070637849235e-07, "logits/chosen": -0.553027331829071, "logits/rejected": -0.7930663824081421, "logps/chosen": -422.29998779296875, "logps/rejected": -472.70001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.7222657203674316, "rewards/margins": 9.696874618530273, "rewards/rejected": -12.415624618530273, "step": 6240 }, { "epoch": 3.2946758039008963, "grad_norm": 0.22909300233010463, "learning_rate": 1.76462836056932e-07, "logits/chosen": -0.38282471895217896, "logits/rejected": -0.759960949420929, "logps/chosen": -348.3999938964844, "logps/rejected": -448.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.957617163658142, "rewards/margins": 10.126562118530273, "rewards/rejected": -12.081250190734863, "step": 6250 }, { "epoch": 3.2999472851871374, "grad_norm": 0.872988681188157, "learning_rate": 1.7514496573537164e-07, "logits/chosen": -0.255645751953125, "logits/rejected": -0.6510009765625, "logps/chosen": -410.3999938964844, "logps/rejected": -462.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.14111328125, "rewards/margins": 9.673437118530273, "rewards/rejected": -11.809374809265137, "step": 6260 }, { "epoch": 3.305218766473379, "grad_norm": 1.691161944499416, "learning_rate": 1.7382709541381126e-07, "logits/chosen": -0.36213380098342896, "logits/rejected": -0.731518566608429, "logps/chosen": -403.70001220703125, "logps/rejected": -441.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.8513672351837158, "rewards/margins": 9.920312881469727, "rewards/rejected": -11.778124809265137, "step": 6270 }, { "epoch": 3.3104902477596205, "grad_norm": 0.42196998105745154, "learning_rate": 1.725092250922509e-07, "logits/chosen": -0.46131592988967896, "logits/rejected": -0.6993163824081421, "logps/chosen": -398.75, "logps/rejected": -458.79998779296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.1197266578674316, "rewards/margins": 9.973437309265137, "rewards/rejected": -12.084375381469727, "step": 6280 }, { "epoch": 3.315761729045862, "grad_norm": 0.7384584761186861, "learning_rate": 1.7119135477069056e-07, "logits/chosen": -0.40861815214157104, "logits/rejected": -0.825390636920929, "logps/chosen": -382.70001220703125, "logps/rejected": -445.20001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.857031226158142, "rewards/margins": 9.917187690734863, "rewards/rejected": -11.771875381469727, "step": 6290 }, { "epoch": 3.321033210332103, "grad_norm": 0.32745001364805676, "learning_rate": 1.6987348444913018e-07, "logits/chosen": -0.24766235053539276, "logits/rejected": -0.685107409954071, "logps/chosen": -411.54998779296875, "logps/rejected": -453.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.661376953125, "rewards/margins": 10.107812881469727, "rewards/rejected": -11.765625, "step": 6300 }, { "epoch": 3.3263046916183447, "grad_norm": 0.8284561290130883, "learning_rate": 1.6855561412756983e-07, "logits/chosen": -0.3520263731479645, "logits/rejected": -0.6046386957168579, "logps/chosen": -374.95001220703125, "logps/rejected": -445.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.610156297683716, "rewards/margins": 9.703125, "rewards/rejected": -12.306249618530273, "step": 6310 }, { "epoch": 3.3315761729045863, "grad_norm": 0.9509873653266253, "learning_rate": 1.6723774380600947e-07, "logits/chosen": -0.45728760957717896, "logits/rejected": -0.7132323980331421, "logps/chosen": -388.3500061035156, "logps/rejected": -449.3999938964844, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.5302734375, "rewards/margins": 9.578125, "rewards/rejected": -12.103124618530273, "step": 6320 }, { "epoch": 3.3368476541908274, "grad_norm": 4.040223528703668, "learning_rate": 1.6591987348444915e-07, "logits/chosen": -0.49132078886032104, "logits/rejected": -0.81494140625, "logps/chosen": -411.20001220703125, "logps/rejected": -456.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0595703125, "rewards/margins": 10.064062118530273, "rewards/rejected": -12.121874809265137, "step": 6330 }, { "epoch": 3.342119135477069, "grad_norm": 0.399578118009273, "learning_rate": 1.6460200316288877e-07, "logits/chosen": -0.3625854551792145, "logits/rejected": -0.69287109375, "logps/chosen": -416.70001220703125, "logps/rejected": -440.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9915039539337158, "rewards/margins": 9.3671875, "rewards/rejected": -11.368749618530273, "step": 6340 }, { "epoch": 3.3473906167633105, "grad_norm": 0.6215514901370964, "learning_rate": 1.6328413284132842e-07, "logits/chosen": -0.21589355170726776, "logits/rejected": -0.584643542766571, "logps/chosen": -398.95001220703125, "logps/rejected": -419.95001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.1292967796325684, "rewards/margins": 9.482812881469727, "rewards/rejected": -11.606249809265137, "step": 6350 }, { "epoch": 3.352662098049552, "grad_norm": 2.7633453183991317, "learning_rate": 1.6196626251976806e-07, "logits/chosen": -0.2530151307582855, "logits/rejected": -0.5644775629043579, "logps/chosen": -393.04998779296875, "logps/rejected": -435.1000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.023242235183716, "rewards/margins": 9.296875, "rewards/rejected": -11.318750381469727, "step": 6360 }, { "epoch": 3.357933579335793, "grad_norm": 0.6749559163860243, "learning_rate": 1.6064839219820768e-07, "logits/chosen": -0.502758800983429, "logits/rejected": -0.7569335699081421, "logps/chosen": -361.75, "logps/rejected": -455.75, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.2278809547424316, "rewards/margins": 10.231249809265137, "rewards/rejected": -12.453125, "step": 6370 }, { "epoch": 3.3632050606220347, "grad_norm": 4.432560628456868, "learning_rate": 1.5933052187664733e-07, "logits/chosen": -0.4859252870082855, "logits/rejected": -0.6512451171875, "logps/chosen": -405.8500061035156, "logps/rejected": -496.29998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.5101561546325684, "rewards/margins": 10.001562118530273, "rewards/rejected": -12.512499809265137, "step": 6380 }, { "epoch": 3.3684765419082763, "grad_norm": 12.469145637790273, "learning_rate": 1.5801265155508698e-07, "logits/chosen": -0.34101563692092896, "logits/rejected": -0.6341552734375, "logps/chosen": -345.1499938964844, "logps/rejected": -413.29998779296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.9765746593475342, "rewards/margins": 9.475000381469727, "rewards/rejected": -11.446874618530273, "step": 6390 }, { "epoch": 3.373748023194518, "grad_norm": 0.841739707872694, "learning_rate": 1.566947812335266e-07, "logits/chosen": -0.2554687559604645, "logits/rejected": -0.683056652545929, "logps/chosen": -404.5, "logps/rejected": -470.5, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.7488281726837158, "rewards/margins": 10.1171875, "rewards/rejected": -11.865625381469727, "step": 6400 }, { "epoch": 3.379019504480759, "grad_norm": 1.372609823342723, "learning_rate": 1.5537691091196625e-07, "logits/chosen": -0.5059432983398438, "logits/rejected": -0.5868896245956421, "logps/chosen": -357.29998779296875, "logps/rejected": -448.75, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.265869140625, "rewards/margins": 9.520312309265137, "rewards/rejected": -11.774999618530273, "step": 6410 }, { "epoch": 3.3842909857670005, "grad_norm": 1.9562292814380065, "learning_rate": 1.540590405904059e-07, "logits/chosen": -0.5151306390762329, "logits/rejected": -0.754101574420929, "logps/chosen": -387.79998779296875, "logps/rejected": -431.25, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.212109327316284, "rewards/margins": 9.393750190734863, "rewards/rejected": -11.609375, "step": 6420 }, { "epoch": 3.389562467053242, "grad_norm": 0.36847276954394875, "learning_rate": 1.5274117026884554e-07, "logits/chosen": -0.4415527284145355, "logits/rejected": -0.594775378704071, "logps/chosen": -338.3500061035156, "logps/rejected": -430.29998779296875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.1626954078674316, "rewards/margins": 9.657812118530273, "rewards/rejected": -11.821874618530273, "step": 6430 }, { "epoch": 3.3948339483394836, "grad_norm": 0.42811124746898704, "learning_rate": 1.5142329994728516e-07, "logits/chosen": -0.4164062440395355, "logits/rejected": -0.6527343988418579, "logps/chosen": -372.8999938964844, "logps/rejected": -442.70001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.862695336341858, "rewards/margins": 9.765625, "rewards/rejected": -11.634374618530273, "step": 6440 }, { "epoch": 3.4001054296257247, "grad_norm": 1.6757290983732689, "learning_rate": 1.501054296257248e-07, "logits/chosen": -0.4200439453125, "logits/rejected": -0.6731811761856079, "logps/chosen": -365.54998779296875, "logps/rejected": -443.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0000977516174316, "rewards/margins": 9.800000190734863, "rewards/rejected": -11.803125381469727, "step": 6450 }, { "epoch": 3.4053769109119663, "grad_norm": 0.7862414710978513, "learning_rate": 1.4878755930416446e-07, "logits/chosen": -0.3541015684604645, "logits/rejected": -0.6087707281112671, "logps/chosen": -380.8500061035156, "logps/rejected": -441.29998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.61102294921875, "rewards/margins": 9.4375, "rewards/rejected": -11.043749809265137, "step": 6460 }, { "epoch": 3.410648392198208, "grad_norm": 0.4311075194161271, "learning_rate": 1.474696889826041e-07, "logits/chosen": -0.6109374761581421, "logits/rejected": -0.748339831829071, "logps/chosen": -419.8500061035156, "logps/rejected": -448.29998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.1285157203674316, "rewards/margins": 9.912500381469727, "rewards/rejected": -12.046875, "step": 6470 }, { "epoch": 3.415919873484449, "grad_norm": 2.268725291757131, "learning_rate": 1.4615181866104375e-07, "logits/chosen": -0.4857421815395355, "logits/rejected": -0.7013794183731079, "logps/chosen": -384.8999938964844, "logps/rejected": -460.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.0826172828674316, "rewards/margins": 10.050000190734863, "rewards/rejected": -12.125, "step": 6480 }, { "epoch": 3.4211913547706905, "grad_norm": 10.750606969180902, "learning_rate": 1.448339483394834e-07, "logits/chosen": -0.20429687201976776, "logits/rejected": -0.5967041254043579, "logps/chosen": -437.5, "logps/rejected": -486.5, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.8727538585662842, "rewards/margins": 9.831250190734863, "rewards/rejected": -11.712499618530273, "step": 6490 }, { "epoch": 3.426462836056932, "grad_norm": 0.45070284559816354, "learning_rate": 1.4351607801792305e-07, "logits/chosen": -0.394287109375, "logits/rejected": -0.626708984375, "logps/chosen": -375.95001220703125, "logps/rejected": -465.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.313061475753784, "rewards/margins": 10.057812690734863, "rewards/rejected": -12.368749618530273, "step": 6500 }, { "epoch": 3.4317343173431736, "grad_norm": 0.5306956472217994, "learning_rate": 1.4219820769636267e-07, "logits/chosen": -0.266091912984848, "logits/rejected": -0.661816418170929, "logps/chosen": -397.20001220703125, "logps/rejected": -447.54998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6707031726837158, "rewards/margins": 10.379687309265137, "rewards/rejected": -12.046875, "step": 6510 }, { "epoch": 3.4370057986294147, "grad_norm": 1.8686471305842327, "learning_rate": 1.4088033737480232e-07, "logits/chosen": -0.28877562284469604, "logits/rejected": -0.8583984375, "logps/chosen": -417.75, "logps/rejected": -482.6000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.330371141433716, "rewards/margins": 10.037500381469727, "rewards/rejected": -12.359375, "step": 6520 }, { "epoch": 3.4422772799156562, "grad_norm": 0.4226960554857179, "learning_rate": 1.3956246705324197e-07, "logits/chosen": -0.23509827256202698, "logits/rejected": -0.6224120855331421, "logps/chosen": -448.1499938964844, "logps/rejected": -474.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.409374952316284, "rewards/margins": 10.456250190734863, "rewards/rejected": -12.862500190734863, "step": 6530 }, { "epoch": 3.447548761201898, "grad_norm": 0.18273602482655255, "learning_rate": 1.382445967316816e-07, "logits/chosen": -0.46809157729148865, "logits/rejected": -0.7088867425918579, "logps/chosen": -417.1000061035156, "logps/rejected": -471.70001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.0443358421325684, "rewards/margins": 9.864062309265137, "rewards/rejected": -11.903124809265137, "step": 6540 }, { "epoch": 3.4528202424881393, "grad_norm": 24.30918267582434, "learning_rate": 1.3692672641012123e-07, "logits/chosen": -0.3657241761684418, "logits/rejected": -0.831787109375, "logps/chosen": -415.8999938964844, "logps/rejected": -469.20001220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.0674805641174316, "rewards/margins": 10.431249618530273, "rewards/rejected": -12.503125190734863, "step": 6550 }, { "epoch": 3.4580917237743805, "grad_norm": 0.1559789166673325, "learning_rate": 1.3560885608856088e-07, "logits/chosen": -0.6226806640625, "logits/rejected": -0.821582019329071, "logps/chosen": -412.8999938964844, "logps/rejected": -469.20001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7701172828674316, "rewards/margins": 10.137499809265137, "rewards/rejected": -12.918749809265137, "step": 6560 }, { "epoch": 3.463363205060622, "grad_norm": 0.8706336723456167, "learning_rate": 1.342909857670005e-07, "logits/chosen": -0.4619140625, "logits/rejected": -0.72528076171875, "logps/chosen": -342.8999938964844, "logps/rejected": -441.0, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.922460913658142, "rewards/margins": 10.149999618530273, "rewards/rejected": -12.074999809265137, "step": 6570 }, { "epoch": 3.4686346863468636, "grad_norm": 1.101138496043216, "learning_rate": 1.3297311544544015e-07, "logits/chosen": -0.4381164610385895, "logits/rejected": -0.717456042766571, "logps/chosen": -437.20001220703125, "logps/rejected": -454.0, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -2.4642577171325684, "rewards/margins": 9.989062309265137, "rewards/rejected": -12.462499618530273, "step": 6580 }, { "epoch": 3.473906167633105, "grad_norm": 4.9484382960559525, "learning_rate": 1.316552451238798e-07, "logits/chosen": -0.4342041015625, "logits/rejected": -0.777099609375, "logps/chosen": -393.1000061035156, "logps/rejected": -467.6000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.5992188453674316, "rewards/margins": 10.240625381469727, "rewards/rejected": -12.850000381469727, "step": 6590 }, { "epoch": 3.479177648919346, "grad_norm": 0.3728892873163608, "learning_rate": 1.3033737480231945e-07, "logits/chosen": -0.3626342713832855, "logits/rejected": -0.728320300579071, "logps/chosen": -414.70001220703125, "logps/rejected": -437.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.267285108566284, "rewards/margins": 9.931249618530273, "rewards/rejected": -12.209375381469727, "step": 6600 }, { "epoch": 3.4844491302055878, "grad_norm": 15.564588071903374, "learning_rate": 1.290195044807591e-07, "logits/chosen": -0.3949951231479645, "logits/rejected": -0.6561034917831421, "logps/chosen": -403.29998779296875, "logps/rejected": -459.1000061035156, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.622265577316284, "rewards/margins": 9.771875381469727, "rewards/rejected": -12.399999618530273, "step": 6610 }, { "epoch": 3.4897206114918293, "grad_norm": 0.2078898190112019, "learning_rate": 1.2770163415919874e-07, "logits/chosen": -0.3092285096645355, "logits/rejected": -0.775097668170929, "logps/chosen": -354.3500061035156, "logps/rejected": -412.79998779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.366894483566284, "rewards/margins": 9.7734375, "rewards/rejected": -12.143750190734863, "step": 6620 }, { "epoch": 3.4949920927780704, "grad_norm": 1.328975780848027, "learning_rate": 1.263837638376384e-07, "logits/chosen": -0.4516235291957855, "logits/rejected": -0.5465332269668579, "logps/chosen": -372.5, "logps/rejected": -474.1000061035156, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.640625, "rewards/margins": 9.756250381469727, "rewards/rejected": -12.390625, "step": 6630 }, { "epoch": 3.500263574064312, "grad_norm": 0.5989079408251893, "learning_rate": 1.25065893516078e-07, "logits/chosen": -0.3038574159145355, "logits/rejected": -0.6701294183731079, "logps/chosen": -354.6499938964844, "logps/rejected": -425.70001220703125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.700097680091858, "rewards/margins": 9.639062881469727, "rewards/rejected": -11.337499618530273, "step": 6640 }, { "epoch": 3.5055350553505535, "grad_norm": 3.730955731702575, "learning_rate": 1.2374802319451766e-07, "logits/chosen": -0.3652710020542145, "logits/rejected": -0.675244152545929, "logps/chosen": -379.1000061035156, "logps/rejected": -459.0, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.0896239280700684, "rewards/margins": 10.065625190734863, "rewards/rejected": -12.146875381469727, "step": 6650 }, { "epoch": 3.510806536636795, "grad_norm": 3.21089662766748, "learning_rate": 1.224301528729573e-07, "logits/chosen": -0.565136730670929, "logits/rejected": -0.7850097417831421, "logps/chosen": -362.79998779296875, "logps/rejected": -432.70001220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.272534132003784, "rewards/margins": 9.665624618530273, "rewards/rejected": -11.940625190734863, "step": 6660 }, { "epoch": 3.5160780179230366, "grad_norm": 3.841834997197911, "learning_rate": 1.2111228255139693e-07, "logits/chosen": -0.3339172303676605, "logits/rejected": -0.5747314691543579, "logps/chosen": -374.95001220703125, "logps/rejected": -428.1000061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.1001954078674316, "rewards/margins": 9.706250190734863, "rewards/rejected": -11.800000190734863, "step": 6670 }, { "epoch": 3.5213494992092778, "grad_norm": 0.4362824517593454, "learning_rate": 1.1979441222983657e-07, "logits/chosen": -0.4371581971645355, "logits/rejected": -0.653759777545929, "logps/chosen": -379.8500061035156, "logps/rejected": -523.2999877929688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.583203077316284, "rewards/margins": 9.934374809265137, "rewards/rejected": -12.509374618530273, "step": 6680 }, { "epoch": 3.5266209804955193, "grad_norm": 0.35047484600263934, "learning_rate": 1.1847654190827622e-07, "logits/chosen": -0.5473877191543579, "logits/rejected": -0.753466784954071, "logps/chosen": -339.6000061035156, "logps/rejected": -437.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.1031250953674316, "rewards/margins": 10.028124809265137, "rewards/rejected": -12.134374618530273, "step": 6690 }, { "epoch": 3.5318924617817604, "grad_norm": 0.6718737454197555, "learning_rate": 1.1715867158671585e-07, "logits/chosen": -0.5663086175918579, "logits/rejected": -0.7984374761581421, "logps/chosen": -364.29998779296875, "logps/rejected": -425.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.466394066810608, "rewards/margins": 9.634374618530273, "rewards/rejected": -11.106249809265137, "step": 6700 }, { "epoch": 3.537163943068002, "grad_norm": 0.3660091095403202, "learning_rate": 1.158408012651555e-07, "logits/chosen": -0.43218994140625, "logits/rejected": -0.67877197265625, "logps/chosen": -363.5, "logps/rejected": -482.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.285845994949341, "rewards/margins": 9.964062690734863, "rewards/rejected": -12.251562118530273, "step": 6710 }, { "epoch": 3.5424354243542435, "grad_norm": 0.49281812396220603, "learning_rate": 1.1452293094359515e-07, "logits/chosen": -0.38646239042282104, "logits/rejected": -0.703173816204071, "logps/chosen": -410.1000061035156, "logps/rejected": -482.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.4649415016174316, "rewards/margins": 10.100000381469727, "rewards/rejected": -12.559374809265137, "step": 6720 }, { "epoch": 3.547706905640485, "grad_norm": 0.12977104588449018, "learning_rate": 1.1320506062203478e-07, "logits/chosen": -0.46723634004592896, "logits/rejected": -0.7063629031181335, "logps/chosen": -417.6000061035156, "logps/rejected": -474.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.1534667015075684, "rewards/margins": 9.90625, "rewards/rejected": -12.056249618530273, "step": 6730 }, { "epoch": 3.5529783869267266, "grad_norm": 0.8269300875990225, "learning_rate": 1.1188719030047443e-07, "logits/chosen": -0.49029541015625, "logits/rejected": -0.727246105670929, "logps/chosen": -412.70001220703125, "logps/rejected": -468.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.303906202316284, "rewards/margins": 10.123437881469727, "rewards/rejected": -12.415624618530273, "step": 6740 }, { "epoch": 3.5582498682129677, "grad_norm": 0.39238030600847873, "learning_rate": 1.1056931997891407e-07, "logits/chosen": -0.3224243223667145, "logits/rejected": -0.629931628704071, "logps/chosen": -370.95001220703125, "logps/rejected": -429.5, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.997705101966858, "rewards/margins": 9.471875190734863, "rewards/rejected": -11.465624809265137, "step": 6750 }, { "epoch": 3.5635213494992093, "grad_norm": 0.8547230719759337, "learning_rate": 1.0925144965735371e-07, "logits/chosen": -0.47435301542282104, "logits/rejected": -0.7099761962890625, "logps/chosen": -388.25, "logps/rejected": -481.70001220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.143261671066284, "rewards/margins": 10.234375, "rewards/rejected": -12.384374618530273, "step": 6760 }, { "epoch": 3.568792830785451, "grad_norm": 0.3646511389907335, "learning_rate": 1.0793357933579335e-07, "logits/chosen": -0.4745239317417145, "logits/rejected": -0.67333984375, "logps/chosen": -389.9750061035156, "logps/rejected": -466.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.1380858421325684, "rewards/margins": 9.871874809265137, "rewards/rejected": -11.996874809265137, "step": 6770 }, { "epoch": 3.574064312071692, "grad_norm": 2.3704175573857054, "learning_rate": 1.0661570901423298e-07, "logits/chosen": -0.47233885526657104, "logits/rejected": -0.7242187261581421, "logps/chosen": -357.70001220703125, "logps/rejected": -428.29998779296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.858862280845642, "rewards/margins": 9.629687309265137, "rewards/rejected": -11.484375, "step": 6780 }, { "epoch": 3.5793357933579335, "grad_norm": 1.353993729808974, "learning_rate": 1.0529783869267264e-07, "logits/chosen": -0.4015869200229645, "logits/rejected": -0.7257324457168579, "logps/chosen": -426.8500061035156, "logps/rejected": -449.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.1487059593200684, "rewards/margins": 10.140625, "rewards/rejected": -12.271875381469727, "step": 6790 }, { "epoch": 3.584607274644175, "grad_norm": 1.1268100208092744, "learning_rate": 1.0397996837111228e-07, "logits/chosen": -0.50762939453125, "logits/rejected": -0.7696288824081421, "logps/chosen": -372.1499938964844, "logps/rejected": -404.3500061035156, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.9983398914337158, "rewards/margins": 9.259374618530273, "rewards/rejected": -11.262499809265137, "step": 6800 }, { "epoch": 3.5898787559304166, "grad_norm": 1.7476734272294063, "learning_rate": 1.0266209804955192e-07, "logits/chosen": -0.4217773377895355, "logits/rejected": -0.569165050983429, "logps/chosen": -418.3500061035156, "logps/rejected": -462.5, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.020214796066284, "rewards/margins": 9.8359375, "rewards/rejected": -11.846875190734863, "step": 6810 }, { "epoch": 3.5951502372166577, "grad_norm": 5.0260343287951885, "learning_rate": 1.0134422772799156e-07, "logits/chosen": -0.4126220643520355, "logits/rejected": -0.651806652545929, "logps/chosen": -381.75, "logps/rejected": -480.0, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.1075196266174316, "rewards/margins": 10.040624618530273, "rewards/rejected": -12.134374618530273, "step": 6820 }, { "epoch": 3.6004217185028993, "grad_norm": 1.0791227770057061, "learning_rate": 1.000263574064312e-07, "logits/chosen": -0.4924072325229645, "logits/rejected": -0.6395508050918579, "logps/chosen": -347.04998779296875, "logps/rejected": -442.29998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.829980492591858, "rewards/margins": 10.040624618530273, "rewards/rejected": -11.868749618530273, "step": 6830 }, { "epoch": 3.605693199789141, "grad_norm": 1.1911561633260344, "learning_rate": 9.870848708487084e-08, "logits/chosen": -0.2555175721645355, "logits/rejected": -0.802490234375, "logps/chosen": -429.04998779296875, "logps/rejected": -431.79998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.79541015625, "rewards/margins": 10.471875190734863, "rewards/rejected": -12.259374618530273, "step": 6840 }, { "epoch": 3.610964681075382, "grad_norm": 6.164733680925296, "learning_rate": 9.739061676331048e-08, "logits/chosen": -0.469482421875, "logits/rejected": -0.600720226764679, "logps/chosen": -340.0, "logps/rejected": -421.0, "loss": 0.0286, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.132031202316284, "rewards/margins": 9.3046875, "rewards/rejected": -11.434374809265137, "step": 6850 }, { "epoch": 3.6162361623616235, "grad_norm": 0.295609847272226, "learning_rate": 9.607274644175014e-08, "logits/chosen": -0.5665649175643921, "logits/rejected": -0.9346679449081421, "logps/chosen": -399.1000061035156, "logps/rejected": -441.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.9420897960662842, "rewards/margins": 10.334375381469727, "rewards/rejected": -12.271875381469727, "step": 6860 }, { "epoch": 3.621507643647865, "grad_norm": 0.27660334679648774, "learning_rate": 9.475487612018977e-08, "logits/chosen": -0.504589855670929, "logits/rejected": -0.708203136920929, "logps/chosen": -367.8999938964844, "logps/rejected": -418.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.5850341320037842, "rewards/margins": 9.935937881469727, "rewards/rejected": -11.515625, "step": 6870 }, { "epoch": 3.6267791249341066, "grad_norm": 0.569414805163528, "learning_rate": 9.343700579862942e-08, "logits/chosen": -0.46424561738967896, "logits/rejected": -0.7645508050918579, "logps/chosen": -391.6499938964844, "logps/rejected": -438.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6790039539337158, "rewards/margins": 9.928125381469727, "rewards/rejected": -11.603124618530273, "step": 6880 }, { "epoch": 3.632050606220348, "grad_norm": 0.16619124950534364, "learning_rate": 9.211913547706905e-08, "logits/chosen": -0.661425769329071, "logits/rejected": -0.7289673089981079, "logps/chosen": -366.3500061035156, "logps/rejected": -454.54998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.3412108421325684, "rewards/margins": 9.323437690734863, "rewards/rejected": -11.662500381469727, "step": 6890 }, { "epoch": 3.6373220875065893, "grad_norm": 3.4339423176021286, "learning_rate": 9.080126515550869e-08, "logits/chosen": -0.2827392518520355, "logits/rejected": -0.596752941608429, "logps/chosen": -372.1000061035156, "logps/rejected": -426.8999938964844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.827246069908142, "rewards/margins": 9.620312690734863, "rewards/rejected": -11.449999809265137, "step": 6900 }, { "epoch": 3.642593568792831, "grad_norm": 0.7391917877334419, "learning_rate": 8.948339483394833e-08, "logits/chosen": -0.44366455078125, "logits/rejected": -0.7123047113418579, "logps/chosen": -410.3500061035156, "logps/rejected": -444.8999938964844, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.686621069908142, "rewards/margins": 9.571874618530273, "rewards/rejected": -11.253125190734863, "step": 6910 }, { "epoch": 3.6478650500790724, "grad_norm": 0.4730806350001132, "learning_rate": 8.816552451238797e-08, "logits/chosen": -0.614697277545929, "logits/rejected": -0.838854968547821, "logps/chosen": -397.45001220703125, "logps/rejected": -469.79998779296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.1812500953674316, "rewards/margins": 9.792187690734863, "rewards/rejected": -11.975000381469727, "step": 6920 }, { "epoch": 3.6531365313653135, "grad_norm": 0.643712872408688, "learning_rate": 8.684765419082763e-08, "logits/chosen": -0.5004943609237671, "logits/rejected": -0.841357409954071, "logps/chosen": -368.3500061035156, "logps/rejected": -431.3999938964844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.1985840797424316, "rewards/margins": 9.809374809265137, "rewards/rejected": -12.003125190734863, "step": 6930 }, { "epoch": 3.658408012651555, "grad_norm": 1.3332856750302577, "learning_rate": 8.552978386926726e-08, "logits/chosen": -0.255178838968277, "logits/rejected": -0.6932617425918579, "logps/chosen": -396.45001220703125, "logps/rejected": -467.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.241015672683716, "rewards/margins": 9.845312118530273, "rewards/rejected": -12.081250190734863, "step": 6940 }, { "epoch": 3.6636794939377966, "grad_norm": 1.9244723238472388, "learning_rate": 8.42119135477069e-08, "logits/chosen": -0.3712097108364105, "logits/rejected": -0.4822143614292145, "logps/chosen": -430.54998779296875, "logps/rejected": -513.5, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.8849608898162842, "rewards/margins": 10.270312309265137, "rewards/rejected": -12.15625, "step": 6950 }, { "epoch": 3.668950975224038, "grad_norm": 1.129122576856452, "learning_rate": 8.289404322614655e-08, "logits/chosen": -0.583813488483429, "logits/rejected": -0.801220715045929, "logps/chosen": -371.3999938964844, "logps/rejected": -434.79998779296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.3968749046325684, "rewards/margins": 9.779687881469727, "rewards/rejected": -12.175000190734863, "step": 6960 }, { "epoch": 3.6742224565102792, "grad_norm": 2.152392953618149, "learning_rate": 8.157617290458618e-08, "logits/chosen": -0.5975341796875, "logits/rejected": -0.883496105670929, "logps/chosen": -381.29998779296875, "logps/rejected": -462.79998779296875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.7356688976287842, "rewards/margins": 10.125, "rewards/rejected": -11.875, "step": 6970 }, { "epoch": 3.679493937796521, "grad_norm": 0.28305515113020707, "learning_rate": 8.025830258302583e-08, "logits/chosen": -0.39910888671875, "logits/rejected": -0.580126941204071, "logps/chosen": -400.5, "logps/rejected": -469.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.6366455554962158, "rewards/margins": 10.068750381469727, "rewards/rejected": -11.71875, "step": 6980 }, { "epoch": 3.6847654190827623, "grad_norm": 0.27751813119295254, "learning_rate": 7.894043226146546e-08, "logits/chosen": -0.5900634527206421, "logits/rejected": -0.8426758050918579, "logps/chosen": -411.8999938964844, "logps/rejected": -482.20001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.3248047828674316, "rewards/margins": 9.846875190734863, "rewards/rejected": -12.171875, "step": 6990 }, { "epoch": 3.6900369003690034, "grad_norm": 0.31173774272708554, "learning_rate": 7.762256193990511e-08, "logits/chosen": -0.45753175020217896, "logits/rejected": -0.7596679925918579, "logps/chosen": -463.1499938964844, "logps/rejected": -498.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.178759813308716, "rewards/margins": 10.125, "rewards/rejected": -12.3125, "step": 7000 }, { "epoch": 3.695308381655245, "grad_norm": 5.2405721019803, "learning_rate": 7.630469161834476e-08, "logits/chosen": -0.5147705078125, "logits/rejected": -0.757031261920929, "logps/chosen": -428.6499938964844, "logps/rejected": -463.20001220703125, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8348388671875, "rewards/margins": 10.017187118530273, "rewards/rejected": -11.862500190734863, "step": 7010 }, { "epoch": 3.7005798629414866, "grad_norm": 2.7006827498375325, "learning_rate": 7.498682129678439e-08, "logits/chosen": -0.535534679889679, "logits/rejected": -0.710742175579071, "logps/chosen": -382.6499938964844, "logps/rejected": -482.6000061035156, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -1.87890625, "rewards/margins": 10.5, "rewards/rejected": -12.371874809265137, "step": 7020 }, { "epoch": 3.705851344227728, "grad_norm": 3.195547651872851, "learning_rate": 7.366895097522404e-08, "logits/chosen": -0.3666748106479645, "logits/rejected": -0.621826171875, "logps/chosen": -396.3500061035156, "logps/rejected": -429.8999938964844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.9656250476837158, "rewards/margins": 9.703125, "rewards/rejected": -11.675000190734863, "step": 7030 }, { "epoch": 3.7111228255139697, "grad_norm": 1.687288078389582, "learning_rate": 7.235108065366367e-08, "logits/chosen": -0.33317869901657104, "logits/rejected": -0.5948486328125, "logps/chosen": -425.3999938964844, "logps/rejected": -459.29998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6598632335662842, "rewards/margins": 9.984375, "rewards/rejected": -11.640625, "step": 7040 }, { "epoch": 3.7163943068002108, "grad_norm": 2.769390033855531, "learning_rate": 7.103321033210331e-08, "logits/chosen": -0.40971678495407104, "logits/rejected": -0.5644989013671875, "logps/chosen": -384.1000061035156, "logps/rejected": -439.75, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.9863770008087158, "rewards/margins": 9.2734375, "rewards/rejected": -11.270312309265137, "step": 7050 }, { "epoch": 3.7216657880864523, "grad_norm": 0.263577235849684, "learning_rate": 6.971534001054295e-08, "logits/chosen": -0.611035168170929, "logits/rejected": -0.638928234577179, "logps/chosen": -374.95001220703125, "logps/rejected": -465.70001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.8317382335662842, "rewards/margins": 9.535937309265137, "rewards/rejected": -11.378125190734863, "step": 7060 }, { "epoch": 3.726937269372694, "grad_norm": 1.4980686925676119, "learning_rate": 6.83974696889826e-08, "logits/chosen": -0.608203113079071, "logits/rejected": -0.6822265386581421, "logps/chosen": -357.95001220703125, "logps/rejected": -429.0, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.770898461341858, "rewards/margins": 9.506250381469727, "rewards/rejected": -11.279687881469727, "step": 7070 }, { "epoch": 3.732208750658935, "grad_norm": 1.6574600268356334, "learning_rate": 6.707959936742225e-08, "logits/chosen": -0.28959959745407104, "logits/rejected": -0.47662353515625, "logps/chosen": -406.20001220703125, "logps/rejected": -462.6000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.8528320789337158, "rewards/margins": 10.131250381469727, "rewards/rejected": -11.981249809265137, "step": 7080 }, { "epoch": 3.7374802319451765, "grad_norm": 0.22953076067483533, "learning_rate": 6.576172904586188e-08, "logits/chosen": -0.4365905821323395, "logits/rejected": -0.6231445074081421, "logps/chosen": -385.25, "logps/rejected": -494.6000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.9713866710662842, "rewards/margins": 10.253125190734863, "rewards/rejected": -12.231249809265137, "step": 7090 }, { "epoch": 3.742751713231418, "grad_norm": 3.149016278886963, "learning_rate": 6.444385872430153e-08, "logits/chosen": -0.630419909954071, "logits/rejected": -0.8523925542831421, "logps/chosen": -380.79998779296875, "logps/rejected": -497.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.2025389671325684, "rewards/margins": 10.746874809265137, "rewards/rejected": -12.946874618530273, "step": 7100 }, { "epoch": 3.7480231945176596, "grad_norm": 0.6543770483685156, "learning_rate": 6.312598840274117e-08, "logits/chosen": -0.4439453184604645, "logits/rejected": -0.824902355670929, "logps/chosen": -382.95001220703125, "logps/rejected": -477.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.35546875, "rewards/margins": 10.159375190734863, "rewards/rejected": -12.509374618530273, "step": 7110 }, { "epoch": 3.7532946758039007, "grad_norm": 1.178197110818575, "learning_rate": 6.180811808118081e-08, "logits/chosen": -0.560717761516571, "logits/rejected": -0.712390124797821, "logps/chosen": -406.1000061035156, "logps/rejected": -500.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9307892322540283, "rewards/margins": 9.870312690734863, "rewards/rejected": -11.806249618530273, "step": 7120 }, { "epoch": 3.7585661570901423, "grad_norm": 1.0747064904821215, "learning_rate": 6.049024775962045e-08, "logits/chosen": -0.3701171875, "logits/rejected": -0.6973632574081421, "logps/chosen": -431.25, "logps/rejected": -481.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.143847703933716, "rewards/margins": 10.059374809265137, "rewards/rejected": -12.221875190734863, "step": 7130 }, { "epoch": 3.763837638376384, "grad_norm": 0.34935341349616206, "learning_rate": 5.917237743806009e-08, "logits/chosen": -0.4850097596645355, "logits/rejected": -0.733105480670929, "logps/chosen": -367.8999938964844, "logps/rejected": -467.8999938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.1474609375, "rewards/margins": 10.421875, "rewards/rejected": -12.571874618530273, "step": 7140 }, { "epoch": 3.769109119662625, "grad_norm": 0.8458129568663264, "learning_rate": 5.7854507116499736e-08, "logits/chosen": -0.6171859502792358, "logits/rejected": -0.7782226800918579, "logps/chosen": -398.1000061035156, "logps/rejected": -454.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.814599633216858, "rewards/margins": 9.7890625, "rewards/rejected": -11.598437309265137, "step": 7150 }, { "epoch": 3.7743806009488665, "grad_norm": 0.35435065666767496, "learning_rate": 5.653663679493938e-08, "logits/chosen": -0.530078113079071, "logits/rejected": -0.6422363519668579, "logps/chosen": -395.1000061035156, "logps/rejected": -489.70001220703125, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4554686546325684, "rewards/margins": 10.103124618530273, "rewards/rejected": -12.550000190734863, "step": 7160 }, { "epoch": 3.779652082235108, "grad_norm": 0.9914061081260559, "learning_rate": 5.521876647337902e-08, "logits/chosen": -0.46769410371780396, "logits/rejected": -0.66015625, "logps/chosen": -416.5, "logps/rejected": -497.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.8023438453674316, "rewards/margins": 9.693750381469727, "rewards/rejected": -12.493749618530273, "step": 7170 }, { "epoch": 3.7849235635213496, "grad_norm": 2.560484543263831, "learning_rate": 5.390089615181866e-08, "logits/chosen": -0.507922351360321, "logits/rejected": -0.7574707269668579, "logps/chosen": -368.75, "logps/rejected": -411.6000061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.942407250404358, "rewards/margins": 9.971875190734863, "rewards/rejected": -11.921875, "step": 7180 }, { "epoch": 3.790195044807591, "grad_norm": 1.0919788491878735, "learning_rate": 5.25830258302583e-08, "logits/chosen": -0.5635010004043579, "logits/rejected": -0.674328625202179, "logps/chosen": -415.75, "logps/rejected": -478.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.7099609375, "rewards/margins": 9.981249809265137, "rewards/rejected": -12.703125, "step": 7190 }, { "epoch": 3.7954665260938323, "grad_norm": 1.6482717251370738, "learning_rate": 5.126515550869794e-08, "logits/chosen": -0.4725280702114105, "logits/rejected": -0.7957519292831421, "logps/chosen": -358.25, "logps/rejected": -442.5, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.4091796875, "rewards/margins": 9.884374618530273, "rewards/rejected": -12.296875, "step": 7200 }, { "epoch": 3.800738007380074, "grad_norm": 0.34071568689349413, "learning_rate": 4.994728518713758e-08, "logits/chosen": -0.48640745878219604, "logits/rejected": -0.6827148199081421, "logps/chosen": -409.29998779296875, "logps/rejected": -477.5, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.140625, "rewards/margins": 9.467187881469727, "rewards/rejected": -11.615625381469727, "step": 7210 }, { "epoch": 3.8060094886663154, "grad_norm": 0.5931888911681608, "learning_rate": 4.862941486557722e-08, "logits/chosen": -0.539794921875, "logits/rejected": -0.66162109375, "logps/chosen": -395.8500061035156, "logps/rejected": -473.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.2730469703674316, "rewards/margins": 10.024999618530273, "rewards/rejected": -12.296875, "step": 7220 }, { "epoch": 3.8112809699525565, "grad_norm": 0.24908123026724227, "learning_rate": 4.731154454401687e-08, "logits/chosen": -0.6122192144393921, "logits/rejected": -0.771240234375, "logps/chosen": -382.3999938964844, "logps/rejected": -424.6000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.7965819835662842, "rewards/margins": 10.090624809265137, "rewards/rejected": -11.884374618530273, "step": 7230 }, { "epoch": 3.816552451238798, "grad_norm": 0.4311849859747789, "learning_rate": 4.5993674222456505e-08, "logits/chosen": -0.533703625202179, "logits/rejected": -0.7334960699081421, "logps/chosen": -347.3500061035156, "logps/rejected": -413.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9376952648162842, "rewards/margins": 10.215624809265137, "rewards/rejected": -12.15625, "step": 7240 }, { "epoch": 3.8218239325250396, "grad_norm": 0.4472590618935205, "learning_rate": 4.4675803900896145e-08, "logits/chosen": -0.5512054562568665, "logits/rejected": -0.8602966070175171, "logps/chosen": -403.3999938964844, "logps/rejected": -432.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.970703125, "rewards/margins": 10.318750381469727, "rewards/rejected": -12.290624618530273, "step": 7250 }, { "epoch": 3.827095413811281, "grad_norm": 1.114253183350934, "learning_rate": 4.335793357933579e-08, "logits/chosen": -0.43879395723342896, "logits/rejected": -0.846875011920929, "logps/chosen": -395.54998779296875, "logps/rejected": -481.70001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.610766649246216, "rewards/margins": 10.100000381469727, "rewards/rejected": -12.715624809265137, "step": 7260 }, { "epoch": 3.8323668950975223, "grad_norm": 3.983857384376989, "learning_rate": 4.2040063257775434e-08, "logits/chosen": -0.3502563536167145, "logits/rejected": -0.703906238079071, "logps/chosen": -388.8500061035156, "logps/rejected": -441.20001220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.834375023841858, "rewards/margins": 9.921875, "rewards/rejected": -11.756250381469727, "step": 7270 }, { "epoch": 3.837638376383764, "grad_norm": 0.4388598321970343, "learning_rate": 4.0722192936215075e-08, "logits/chosen": -0.43162840604782104, "logits/rejected": -0.7490234375, "logps/chosen": -413.20001220703125, "logps/rejected": -469.8500061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.484570264816284, "rewards/margins": 10.03125, "rewards/rejected": -12.521875381469727, "step": 7280 }, { "epoch": 3.8429098576700054, "grad_norm": 0.20443960528317578, "learning_rate": 3.9404322614654716e-08, "logits/chosen": -0.532153308391571, "logits/rejected": -0.7677246332168579, "logps/chosen": -386.5, "logps/rejected": -464.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.386767625808716, "rewards/margins": 9.859375, "rewards/rejected": -12.243749618530273, "step": 7290 }, { "epoch": 3.8481813389562465, "grad_norm": 2.347999781466845, "learning_rate": 3.808645229309436e-08, "logits/chosen": -0.398101806640625, "logits/rejected": -0.811816394329071, "logps/chosen": -402.25, "logps/rejected": -473.1000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.037402391433716, "rewards/margins": 10.240625381469727, "rewards/rejected": -12.278124809265137, "step": 7300 }, { "epoch": 3.853452820242488, "grad_norm": 0.40790695125882853, "learning_rate": 3.6768581971534e-08, "logits/chosen": -0.3311096131801605, "logits/rejected": -0.633105456829071, "logps/chosen": -379.6000061035156, "logps/rejected": -486.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.48388671875, "rewards/margins": 10.353124618530273, "rewards/rejected": -12.837499618530273, "step": 7310 }, { "epoch": 3.8587243015287296, "grad_norm": 3.141440158152639, "learning_rate": 3.545071164997364e-08, "logits/chosen": -0.4776977598667145, "logits/rejected": -0.74169921875, "logps/chosen": -393.1000061035156, "logps/rejected": -466.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.280078172683716, "rewards/margins": 9.918749809265137, "rewards/rejected": -12.206250190734863, "step": 7320 }, { "epoch": 3.863995782814971, "grad_norm": 0.9384598006280905, "learning_rate": 3.4132841328413286e-08, "logits/chosen": -0.560302734375, "logits/rejected": -0.75341796875, "logps/chosen": -395.1000061035156, "logps/rejected": -490.5, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.067187547683716, "rewards/margins": 9.824999809265137, "rewards/rejected": -11.890625, "step": 7330 }, { "epoch": 3.8692672641012127, "grad_norm": 3.512568659931531, "learning_rate": 3.281497100685293e-08, "logits/chosen": -0.30424803495407104, "logits/rejected": -0.734790027141571, "logps/chosen": -400.95001220703125, "logps/rejected": -465.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.090283155441284, "rewards/margins": 10.128125190734863, "rewards/rejected": -12.212499618530273, "step": 7340 }, { "epoch": 3.874538745387454, "grad_norm": 1.1137075948688457, "learning_rate": 3.149710068529256e-08, "logits/chosen": -0.3084350526332855, "logits/rejected": -0.6802734136581421, "logps/chosen": -348.29998779296875, "logps/rejected": -427.8999938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.0443358421325684, "rewards/margins": 9.884374618530273, "rewards/rejected": -11.915624618530273, "step": 7350 }, { "epoch": 3.8798102266736954, "grad_norm": 0.5958259407019647, "learning_rate": 3.017923036373221e-08, "logits/chosen": -0.41771239042282104, "logits/rejected": -0.705859363079071, "logps/chosen": -414.8500061035156, "logps/rejected": -463.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.14453125, "rewards/margins": 9.571874618530273, "rewards/rejected": -11.712499618530273, "step": 7360 }, { "epoch": 3.885081707959937, "grad_norm": 5.647255566659271, "learning_rate": 2.886136004217185e-08, "logits/chosen": -0.3224731385707855, "logits/rejected": -0.619641125202179, "logps/chosen": -381.75, "logps/rejected": -463.1000061035156, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.1712889671325684, "rewards/margins": 9.628125190734863, "rewards/rejected": -11.803125381469727, "step": 7370 }, { "epoch": 3.890353189246178, "grad_norm": 2.2099392269443077, "learning_rate": 2.754348972061149e-08, "logits/chosen": -0.41669923067092896, "logits/rejected": -0.69921875, "logps/chosen": -405.6000061035156, "logps/rejected": -457.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.783935546875, "rewards/margins": 10.078125, "rewards/rejected": -11.868749618530273, "step": 7380 }, { "epoch": 3.8956246705324196, "grad_norm": 3.736914829156437, "learning_rate": 2.6225619399051132e-08, "logits/chosen": -0.4591918885707855, "logits/rejected": -0.6561279296875, "logps/chosen": -371.79998779296875, "logps/rejected": -451.6000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.2802734375, "rewards/margins": 9.350000381469727, "rewards/rejected": -11.628125190734863, "step": 7390 }, { "epoch": 3.900896151818661, "grad_norm": 0.22344249545507444, "learning_rate": 2.4907749077490773e-08, "logits/chosen": -0.5944000482559204, "logits/rejected": -0.754345715045929, "logps/chosen": -413.0, "logps/rejected": -454.79998779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.7396972179412842, "rewards/margins": 10.268750190734863, "rewards/rejected": -12.009374618530273, "step": 7400 }, { "epoch": 3.9061676331049027, "grad_norm": 0.48430356709042294, "learning_rate": 2.3589878755930417e-08, "logits/chosen": -0.3170410096645355, "logits/rejected": -0.696533203125, "logps/chosen": -444.4750061035156, "logps/rejected": -464.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.9145996570587158, "rewards/margins": 9.957812309265137, "rewards/rejected": -11.878125190734863, "step": 7410 }, { "epoch": 3.911439114391144, "grad_norm": 0.07889915649567718, "learning_rate": 2.2272008434370054e-08, "logits/chosen": -0.3852218687534332, "logits/rejected": -0.6841796636581421, "logps/chosen": -405.6000061035156, "logps/rejected": -471.6000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.087890625, "rewards/margins": 10.009374618530273, "rewards/rejected": -12.106249809265137, "step": 7420 }, { "epoch": 3.9167105956773853, "grad_norm": 0.9417059834869911, "learning_rate": 2.09541381128097e-08, "logits/chosen": -0.39653319120407104, "logits/rejected": -0.807324230670929, "logps/chosen": -385.0, "logps/rejected": -445.3999938964844, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.453906297683716, "rewards/margins": 9.528124809265137, "rewards/rejected": -11.981249809265137, "step": 7430 }, { "epoch": 3.921982076963627, "grad_norm": 12.53579331835041, "learning_rate": 1.9636267791249343e-08, "logits/chosen": -0.579052746295929, "logits/rejected": -0.733154296875, "logps/chosen": -373.54998779296875, "logps/rejected": -444.5, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.2616209983825684, "rewards/margins": 9.801562309265137, "rewards/rejected": -12.059374809265137, "step": 7440 }, { "epoch": 3.927253558249868, "grad_norm": 0.31487610146450296, "learning_rate": 1.831839746968898e-08, "logits/chosen": -0.5538574457168579, "logits/rejected": -0.826416015625, "logps/chosen": -397.45001220703125, "logps/rejected": -452.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.9255859851837158, "rewards/margins": 9.673437118530273, "rewards/rejected": -11.603124618530273, "step": 7450 }, { "epoch": 3.9325250395361095, "grad_norm": 0.16091157508267598, "learning_rate": 1.7000527148128625e-08, "logits/chosen": -0.4328247010707855, "logits/rejected": -0.6698242425918579, "logps/chosen": -400.0249938964844, "logps/rejected": -468.75, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.2455077171325684, "rewards/margins": 9.740625381469727, "rewards/rejected": -11.987500190734863, "step": 7460 }, { "epoch": 3.937796520822351, "grad_norm": 0.14130706361101605, "learning_rate": 1.5682656826568266e-08, "logits/chosen": -0.49785155057907104, "logits/rejected": -0.6570190191268921, "logps/chosen": -409.70001220703125, "logps/rejected": -493.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.93994140625, "rewards/margins": 10.717187881469727, "rewards/rejected": -12.668749809265137, "step": 7470 }, { "epoch": 3.9430680021085927, "grad_norm": 0.45730880441725835, "learning_rate": 1.4364786505007907e-08, "logits/chosen": -0.28300780057907104, "logits/rejected": -0.49226075410842896, "logps/chosen": -342.5249938964844, "logps/rejected": -481.79998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.0365967750549316, "rewards/margins": 10.074999809265137, "rewards/rejected": -12.115625381469727, "step": 7480 }, { "epoch": 3.948339483394834, "grad_norm": 0.47121945580840136, "learning_rate": 1.304691618344755e-08, "logits/chosen": -0.389007568359375, "logits/rejected": -0.771191418170929, "logps/chosen": -408.1000061035156, "logps/rejected": -463.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.8737761974334717, "rewards/margins": 9.884374618530273, "rewards/rejected": -11.762499809265137, "step": 7490 }, { "epoch": 3.9536109646810753, "grad_norm": 1.6338128302023696, "learning_rate": 1.172904586188719e-08, "logits/chosen": -0.7134765386581421, "logits/rejected": -0.854736328125, "logps/chosen": -363.1000061035156, "logps/rejected": -423.3999938964844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.673828125, "rewards/margins": 9.493749618530273, "rewards/rejected": -12.162500381469727, "step": 7500 }, { "epoch": 3.958882445967317, "grad_norm": 0.3230840812012249, "learning_rate": 1.0411175540326831e-08, "logits/chosen": -0.549072265625, "logits/rejected": -0.8006347417831421, "logps/chosen": -416.20001220703125, "logps/rejected": -467.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.066021680831909, "rewards/margins": 9.774999618530273, "rewards/rejected": -11.84375, "step": 7510 }, { "epoch": 3.964153927253558, "grad_norm": 0.6397000589505722, "learning_rate": 9.093305218766472e-09, "logits/chosen": -0.2571777403354645, "logits/rejected": -0.603271484375, "logps/chosen": -397.95001220703125, "logps/rejected": -474.3999938964844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.119903564453125, "rewards/margins": 9.90625, "rewards/rejected": -12.021875381469727, "step": 7520 }, { "epoch": 3.9694254085397995, "grad_norm": 4.980555462280309, "learning_rate": 7.775434897206115e-09, "logits/chosen": -0.6501106023788452, "logits/rejected": -0.845703125, "logps/chosen": -364.5, "logps/rejected": -447.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.033203125, "rewards/margins": 9.987500190734863, "rewards/rejected": -12.024999618530273, "step": 7530 }, { "epoch": 3.974696889826041, "grad_norm": 0.3010145300927002, "learning_rate": 6.457564575645756e-09, "logits/chosen": -0.5448395013809204, "logits/rejected": -0.6832031011581421, "logps/chosen": -385.1499938964844, "logps/rejected": -443.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.074414014816284, "rewards/margins": 9.560937881469727, "rewards/rejected": -11.631250381469727, "step": 7540 }, { "epoch": 3.9799683711122826, "grad_norm": 0.8085970252912793, "learning_rate": 5.139694254085398e-09, "logits/chosen": -0.4872680604457855, "logits/rejected": -0.904101550579071, "logps/chosen": -392.29998779296875, "logps/rejected": -444.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.380859375, "rewards/margins": 9.771875381469727, "rewards/rejected": -12.153124809265137, "step": 7550 }, { "epoch": 3.985239852398524, "grad_norm": 0.530158579335915, "learning_rate": 3.82182393252504e-09, "logits/chosen": -0.37261962890625, "logits/rejected": -0.6993652582168579, "logps/chosen": -384.1000061035156, "logps/rejected": -462.20001220703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.920312523841858, "rewards/margins": 9.407812118530273, "rewards/rejected": -11.328125, "step": 7560 }, { "epoch": 3.9905113336847653, "grad_norm": 1.1428287795923635, "learning_rate": 2.5039536109646808e-09, "logits/chosen": -0.6201416254043579, "logits/rejected": -0.8602539300918579, "logps/chosen": -440.20001220703125, "logps/rejected": -480.3999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.08984375, "rewards/margins": 10.239062309265137, "rewards/rejected": -12.318750381469727, "step": 7570 }, { "epoch": 3.995782814971007, "grad_norm": 3.4074325312909868, "learning_rate": 1.1860832894043225e-09, "logits/chosen": -0.42866212129592896, "logits/rejected": -0.6150451898574829, "logps/chosen": -414.20001220703125, "logps/rejected": -507.5, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.343554735183716, "rewards/margins": 10.110937118530273, "rewards/rejected": -12.459375381469727, "step": 7580 } ], "logging_steps": 10, "max_steps": 7588, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }