{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 15176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002635740643120717, "grad_norm": 142.80002298844403, "learning_rate": 9.994069583552977e-07, "logits/chosen": 0.8333984613418579, "logits/rejected": 0.90576171875, "logps/chosen": -390.3500061035156, "logps/rejected": -340.0, "loss": 0.7302, "rewards/accuracies": 0.375, "rewards/chosen": -0.36738282442092896, "rewards/margins": -0.03417816013097763, "rewards/rejected": -0.3330078125, "step": 10 }, { "epoch": 0.005271481286241434, "grad_norm": 147.09559545068782, "learning_rate": 9.987480231945176e-07, "logits/chosen": 0.89599609375, "logits/rejected": 0.9306640625, "logps/chosen": -384.75, "logps/rejected": -333.29998779296875, "loss": 0.6828, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.11380615085363388, "rewards/margins": 0.03700561448931694, "rewards/rejected": -0.1506500244140625, "step": 20 }, { "epoch": 0.00790722192936215, "grad_norm": 114.74754198299179, "learning_rate": 9.980890880337376e-07, "logits/chosen": 0.943554699420929, "logits/rejected": 1.03173828125, "logps/chosen": -368.29998779296875, "logps/rejected": -328.6000061035156, "loss": 0.6284, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3600097596645355, "rewards/margins": 0.21378174424171448, "rewards/rejected": 0.146453857421875, "step": 30 }, { "epoch": 0.010542962572482868, "grad_norm": 175.79913370127386, "learning_rate": 9.974301528729573e-07, "logits/chosen": 0.9200195074081421, "logits/rejected": 0.894335925579071, "logps/chosen": -362.70001220703125, "logps/rejected": -326.6499938964844, "loss": 0.6864, "rewards/accuracies": 0.53125, "rewards/chosen": 0.12729644775390625, "rewards/margins": 0.14492186903953552, "rewards/rejected": -0.01799621619284153, "step": 40 }, { "epoch": 0.013178703215603585, "grad_norm": 213.51422256550887, "learning_rate": 9.96771217712177e-07, "logits/chosen": 0.7676757574081421, "logits/rejected": 0.7172104120254517, "logps/chosen": -389.1499938964844, "logps/rejected": -413.70001220703125, "loss": 0.6603, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20021972060203552, "rewards/margins": 0.284210205078125, "rewards/rejected": -0.48457032442092896, "step": 50 }, { "epoch": 0.0158144438587243, "grad_norm": 104.88860311633891, "learning_rate": 9.961122825513968e-07, "logits/chosen": 0.78759765625, "logits/rejected": 0.719897449016571, "logps/chosen": -353.25, "logps/rejected": -297.3500061035156, "loss": 0.621, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05356445163488388, "rewards/margins": 0.34759521484375, "rewards/rejected": -0.4009033143520355, "step": 60 }, { "epoch": 0.01845018450184502, "grad_norm": 114.99693159858597, "learning_rate": 9.954533473906168e-07, "logits/chosen": 0.789990246295929, "logits/rejected": 0.842968761920929, "logps/chosen": -339.54998779296875, "logps/rejected": -339.3500061035156, "loss": 0.6018, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.30607908964157104, "rewards/margins": 0.3864502012729645, "rewards/rejected": -0.08041229099035263, "step": 70 }, { "epoch": 0.021085925144965736, "grad_norm": 121.03678409052164, "learning_rate": 9.947944122298365e-07, "logits/chosen": 0.8626953363418579, "logits/rejected": 0.7369140386581421, "logps/chosen": -347.95001220703125, "logps/rejected": -331.1499938964844, "loss": 0.6682, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08101501315832138, "rewards/margins": 0.40678709745407104, "rewards/rejected": -0.326171875, "step": 80 }, { "epoch": 0.023721665788086453, "grad_norm": 159.2895521553378, "learning_rate": 9.941354770690563e-07, "logits/chosen": 0.570849597454071, "logits/rejected": 0.5972824096679688, "logps/chosen": -405.04998779296875, "logps/rejected": -346.6000061035156, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11384887993335724, "rewards/margins": 0.42594605684280396, "rewards/rejected": -0.5399856567382812, "step": 90 }, { "epoch": 0.02635740643120717, "grad_norm": 143.491887062669, "learning_rate": 9.934765419082762e-07, "logits/chosen": 0.975878894329071, "logits/rejected": 0.898388683795929, "logps/chosen": -381.20001220703125, "logps/rejected": -347.1499938964844, "loss": 0.5572, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.5149902105331421, "rewards/margins": 0.55059814453125, "rewards/rejected": -0.03634033352136612, "step": 100 }, { "epoch": 0.028993147074327884, "grad_norm": 153.28162961214383, "learning_rate": 9.92817606747496e-07, "logits/chosen": 0.795703113079071, "logits/rejected": 0.820068359375, "logps/chosen": -360.79998779296875, "logps/rejected": -361.3999938964844, "loss": 0.6477, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.5370117425918579, "rewards/margins": 0.42194825410842896, "rewards/rejected": 0.11588744819164276, "step": 110 }, { "epoch": 0.0316288877174486, "grad_norm": 121.57942347174213, "learning_rate": 9.92158671586716e-07, "logits/chosen": 0.789501965045929, "logits/rejected": 0.815380871295929, "logps/chosen": -349.3500061035156, "logps/rejected": -362.54998779296875, "loss": 0.6131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.39960938692092896, "rewards/margins": 0.4585632383823395, "rewards/rejected": -0.05964965745806694, "step": 120 }, { "epoch": 0.03426462836056932, "grad_norm": 119.74574075965474, "learning_rate": 9.914997364259357e-07, "logits/chosen": 0.7103515863418579, "logits/rejected": 0.745898425579071, "logps/chosen": -375.95001220703125, "logps/rejected": -351.70001220703125, "loss": 0.7323, "rewards/accuracies": 0.5625, "rewards/chosen": 0.117919921875, "rewards/margins": 0.213836669921875, "rewards/rejected": -0.09629516303539276, "step": 130 }, { "epoch": 0.03690036900369004, "grad_norm": 137.05588508857235, "learning_rate": 9.908408012651554e-07, "logits/chosen": 0.9239257574081421, "logits/rejected": 0.795703113079071, "logps/chosen": -392.6000061035156, "logps/rejected": -349.25, "loss": 0.6405, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.3499755859375, "rewards/margins": 0.4111328125, "rewards/rejected": -0.061279296875, "step": 140 }, { "epoch": 0.03953610964681075, "grad_norm": 117.44552707664239, "learning_rate": 9.901818661043754e-07, "logits/chosen": 0.7646484375, "logits/rejected": 0.874316394329071, "logps/chosen": -372.20001220703125, "logps/rejected": -353.79998779296875, "loss": 0.6262, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4762939512729645, "rewards/margins": 0.33604127168655396, "rewards/rejected": 0.1412353515625, "step": 150 }, { "epoch": 0.04217185028993147, "grad_norm": 120.2045397328374, "learning_rate": 9.89522930943595e-07, "logits/chosen": 0.7682129144668579, "logits/rejected": 0.82080078125, "logps/chosen": -330.0, "logps/rejected": -315.20001220703125, "loss": 0.5899, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.37034910917282104, "rewards/margins": 0.44886475801467896, "rewards/rejected": -0.07872314751148224, "step": 160 }, { "epoch": 0.044807590933052185, "grad_norm": 178.06002449447763, "learning_rate": 9.888639957828148e-07, "logits/chosen": 0.8521484136581421, "logits/rejected": 0.686083972454071, "logps/chosen": -375.75, "logps/rejected": -341.375, "loss": 0.611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.527636706829071, "rewards/margins": 0.591656506061554, "rewards/rejected": -0.06350097805261612, "step": 170 }, { "epoch": 0.047443331576172906, "grad_norm": 133.0754379388097, "learning_rate": 9.882050606220348e-07, "logits/chosen": 0.7113281488418579, "logits/rejected": 0.7168213129043579, "logps/chosen": -411.1499938964844, "logps/rejected": -389.6499938964844, "loss": 0.5863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.611010730266571, "rewards/margins": 0.640643298625946, "rewards/rejected": -0.02987060509622097, "step": 180 }, { "epoch": 0.05007907221929362, "grad_norm": 126.32537567549112, "learning_rate": 9.875461254612545e-07, "logits/chosen": 0.5790039300918579, "logits/rejected": 0.597674548625946, "logps/chosen": -388.45001220703125, "logps/rejected": -331.0, "loss": 0.5858, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4035400450229645, "rewards/margins": 0.5658813714981079, "rewards/rejected": -0.16220703721046448, "step": 190 }, { "epoch": 0.05271481286241434, "grad_norm": 113.40800126538313, "learning_rate": 9.868871903004745e-07, "logits/chosen": 0.808789074420929, "logits/rejected": 0.714404284954071, "logps/chosen": -384.20001220703125, "logps/rejected": -347.70001220703125, "loss": 0.5988, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5831054449081421, "rewards/margins": 0.577099621295929, "rewards/rejected": 0.0065673827193677425, "step": 200 }, { "epoch": 0.055350553505535055, "grad_norm": 111.13293533187709, "learning_rate": 9.862282551396942e-07, "logits/chosen": 0.7815917730331421, "logits/rejected": 0.7490234375, "logps/chosen": -360.70001220703125, "logps/rejected": -318.45001220703125, "loss": 0.5951, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.891308605670929, "rewards/margins": 0.5536133050918579, "rewards/rejected": 0.33759766817092896, "step": 210 }, { "epoch": 0.05798629414865577, "grad_norm": 96.40949692864356, "learning_rate": 9.85569319978914e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.7552734613418579, "logps/chosen": -345.5, "logps/rejected": -324.70001220703125, "loss": 0.5339, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.846728503704071, "rewards/margins": 0.669787585735321, "rewards/rejected": 0.17659302055835724, "step": 220 }, { "epoch": 0.06062203479177649, "grad_norm": 117.47602124365424, "learning_rate": 9.849103848181337e-07, "logits/chosen": 0.6584228277206421, "logits/rejected": 0.6241699457168579, "logps/chosen": -359.6000061035156, "logps/rejected": -294.04998779296875, "loss": 0.5723, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.47773438692092896, "rewards/margins": 0.6700195074081421, "rewards/rejected": -0.192352294921875, "step": 230 }, { "epoch": 0.0632577754348972, "grad_norm": 150.55241617559702, "learning_rate": 9.842514496573537e-07, "logits/chosen": 0.762860119342804, "logits/rejected": 0.772753894329071, "logps/chosen": -366.1499938964844, "logps/rejected": -360.8999938964844, "loss": 0.6289, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.57843017578125, "rewards/margins": 0.5278564691543579, "rewards/rejected": 0.05092773586511612, "step": 240 }, { "epoch": 0.06589351607801792, "grad_norm": 115.92649640350405, "learning_rate": 9.835925144965736e-07, "logits/chosen": 0.5935195684432983, "logits/rejected": 0.509716808795929, "logps/chosen": -385.0, "logps/rejected": -316.3999938964844, "loss": 0.5879, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.8389037847518921, "rewards/margins": 0.672863781452179, "rewards/rejected": 0.16593018174171448, "step": 250 }, { "epoch": 0.06852925672113865, "grad_norm": 122.05191027011293, "learning_rate": 9.829335793357934e-07, "logits/chosen": 0.7164551019668579, "logits/rejected": 0.6724609136581421, "logps/chosen": -366.5, "logps/rejected": -344.1499938964844, "loss": 0.6442, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.3489990234375, "rewards/margins": 0.5303955078125, "rewards/rejected": -0.18093261122703552, "step": 260 }, { "epoch": 0.07116499736425935, "grad_norm": 102.79806323973028, "learning_rate": 9.822746441750131e-07, "logits/chosen": 0.6172240972518921, "logits/rejected": 0.5808349847793579, "logps/chosen": -410.1499938964844, "logps/rejected": -383.3500061035156, "loss": 0.5707, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4306396543979645, "rewards/margins": 0.7081543207168579, "rewards/rejected": -0.2780090272426605, "step": 270 }, { "epoch": 0.07380073800738007, "grad_norm": 157.19456506496, "learning_rate": 9.816157090142329e-07, "logits/chosen": 0.749560534954071, "logits/rejected": 0.606457531452179, "logps/chosen": -374.79998779296875, "logps/rejected": -318.25, "loss": 0.5957, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.4519287049770355, "rewards/margins": 0.676464855670929, "rewards/rejected": -0.22434082627296448, "step": 280 }, { "epoch": 0.0764364786505008, "grad_norm": 168.89729830063774, "learning_rate": 9.809567738534528e-07, "logits/chosen": 0.71728515625, "logits/rejected": 0.68389892578125, "logps/chosen": -390.8999938964844, "logps/rejected": -357.1000061035156, "loss": 0.5394, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.41143798828125, "rewards/margins": 0.813525378704071, "rewards/rejected": -0.4016357362270355, "step": 290 }, { "epoch": 0.0790722192936215, "grad_norm": 118.09467365376177, "learning_rate": 9.802978386926726e-07, "logits/chosen": 0.683734118938446, "logits/rejected": 0.8016601800918579, "logps/chosen": -372.5, "logps/rejected": -369.1499938964844, "loss": 0.66, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.15899658203125, "rewards/margins": 0.5799316167831421, "rewards/rejected": -0.42110902070999146, "step": 300 }, { "epoch": 0.08170795993674222, "grad_norm": 139.81231364643986, "learning_rate": 9.796389035318923e-07, "logits/chosen": 0.6932617425918579, "logits/rejected": 0.74951171875, "logps/chosen": -352.54998779296875, "logps/rejected": -331.5, "loss": 0.5337, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7614196538925171, "rewards/margins": 0.875732421875, "rewards/rejected": -0.11368713527917862, "step": 310 }, { "epoch": 0.08434370057986294, "grad_norm": 123.80027641164568, "learning_rate": 9.789799683711123e-07, "logits/chosen": 0.7908691167831421, "logits/rejected": 0.77362060546875, "logps/chosen": -369.70001220703125, "logps/rejected": -320.8500061035156, "loss": 0.5002, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.952343761920929, "rewards/margins": 0.83837890625, "rewards/rejected": 0.11444701999425888, "step": 320 }, { "epoch": 0.08697944122298366, "grad_norm": 116.69125182405122, "learning_rate": 9.78321033210332e-07, "logits/chosen": 0.8140624761581421, "logits/rejected": 0.7591797113418579, "logps/chosen": -355.95001220703125, "logps/rejected": -303.70001220703125, "loss": 0.5718, "rewards/accuracies": 0.6875, "rewards/chosen": 0.896923840045929, "rewards/margins": 0.7447265386581421, "rewards/rejected": 0.15214844048023224, "step": 330 }, { "epoch": 0.08961518186610437, "grad_norm": 90.8010539031064, "learning_rate": 9.77662098049552e-07, "logits/chosen": 0.719738781452179, "logits/rejected": 0.711230456829071, "logps/chosen": -335.8999938964844, "logps/rejected": -311.7250061035156, "loss": 0.5992, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7264648675918579, "rewards/margins": 0.749011218547821, "rewards/rejected": -0.0223388671875, "step": 340 }, { "epoch": 0.09225092250922509, "grad_norm": 125.36063692679598, "learning_rate": 9.770031628887717e-07, "logits/chosen": 0.7929229736328125, "logits/rejected": 0.7035888433456421, "logps/chosen": -430.1499938964844, "logps/rejected": -398.1000061035156, "loss": 0.6448, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.5741211175918579, "rewards/margins": 0.6505126953125, "rewards/rejected": -0.07601318508386612, "step": 350 }, { "epoch": 0.09488666315234581, "grad_norm": 103.02419170538661, "learning_rate": 9.763442277279915e-07, "logits/chosen": 0.760449230670929, "logits/rejected": 0.7691406011581421, "logps/chosen": -427.8999938964844, "logps/rejected": -339.25, "loss": 0.5867, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.7510741949081421, "rewards/margins": 0.819506824016571, "rewards/rejected": -0.06887207180261612, "step": 360 }, { "epoch": 0.09752240379546652, "grad_norm": 108.4435569714711, "learning_rate": 9.756852925672114e-07, "logits/chosen": 0.925952136516571, "logits/rejected": 0.872265636920929, "logps/chosen": -356.54998779296875, "logps/rejected": -310.5, "loss": 0.5586, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.034277319908142, "rewards/margins": 0.810791015625, "rewards/rejected": 0.22343139350414276, "step": 370 }, { "epoch": 0.10015814443858724, "grad_norm": 169.52583090617728, "learning_rate": 9.750263574064312e-07, "logits/chosen": 0.8651367425918579, "logits/rejected": 0.785205066204071, "logps/chosen": -346.1499938964844, "logps/rejected": -336.8999938964844, "loss": 0.625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.804736316204071, "rewards/margins": 0.6447509527206421, "rewards/rejected": 0.16058960556983948, "step": 380 }, { "epoch": 0.10279388508170796, "grad_norm": 124.9348806221308, "learning_rate": 9.74367422245651e-07, "logits/chosen": 0.87353515625, "logits/rejected": 0.9066406488418579, "logps/chosen": -389.20001220703125, "logps/rejected": -365.1000061035156, "loss": 0.6164, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7646423578262329, "rewards/margins": 0.6929931640625, "rewards/rejected": 0.07298584282398224, "step": 390 }, { "epoch": 0.10542962572482868, "grad_norm": 148.21113086190914, "learning_rate": 9.737084870848709e-07, "logits/chosen": 0.6572021245956421, "logits/rejected": 0.711865246295929, "logps/chosen": -365.04998779296875, "logps/rejected": -322.20001220703125, "loss": 0.5913, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.7607421875, "rewards/margins": 0.676928699016571, "rewards/rejected": 0.08380126953125, "step": 400 }, { "epoch": 0.10806536636794939, "grad_norm": 103.68211887287947, "learning_rate": 9.730495519240906e-07, "logits/chosen": 0.6793457269668579, "logits/rejected": 0.5867675542831421, "logps/chosen": -380.95001220703125, "logps/rejected": -355.54998779296875, "loss": 0.5349, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.613696277141571, "rewards/margins": 0.828369140625, "rewards/rejected": -0.21380615234375, "step": 410 }, { "epoch": 0.11070110701107011, "grad_norm": 76.10877128159464, "learning_rate": 9.723906167633106e-07, "logits/chosen": 0.807568371295929, "logits/rejected": 0.8403472900390625, "logps/chosen": -385.0, "logps/rejected": -332.8999938964844, "loss": 0.5322, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.579296886920929, "rewards/margins": 0.802294909954071, "rewards/rejected": -0.22318725287914276, "step": 420 }, { "epoch": 0.11333684765419083, "grad_norm": 121.66075977041314, "learning_rate": 9.717316816025303e-07, "logits/chosen": 0.639599621295929, "logits/rejected": 0.5549103021621704, "logps/chosen": -337.8500061035156, "logps/rejected": -345.25, "loss": 0.5627, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3372802734375, "rewards/margins": 0.758984386920929, "rewards/rejected": -0.42119139432907104, "step": 430 }, { "epoch": 0.11597258829731154, "grad_norm": 171.52135614991582, "learning_rate": 9.7107274644175e-07, "logits/chosen": 0.6080566644668579, "logits/rejected": 0.638763427734375, "logps/chosen": -329.95001220703125, "logps/rejected": -316.5, "loss": 0.6536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.33367919921875, "rewards/margins": 0.5439453125, "rewards/rejected": -0.2099609375, "step": 440 }, { "epoch": 0.11860832894043226, "grad_norm": 103.74464322060084, "learning_rate": 9.704138112809698e-07, "logits/chosen": 0.5243164300918579, "logits/rejected": 0.6053711175918579, "logps/chosen": -347.0, "logps/rejected": -348.0, "loss": 0.6528, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.3625732362270355, "rewards/margins": 0.607006847858429, "rewards/rejected": -0.24469299614429474, "step": 450 }, { "epoch": 0.12124406958355298, "grad_norm": 186.34910930261125, "learning_rate": 9.697548761201898e-07, "logits/chosen": 0.664306640625, "logits/rejected": 0.5854431390762329, "logps/chosen": -336.79998779296875, "logps/rejected": -329.45001220703125, "loss": 0.6138, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.4293151795864105, "rewards/margins": 0.7220703363418579, "rewards/rejected": -0.29265135526657104, "step": 460 }, { "epoch": 0.1238798102266737, "grad_norm": 120.09301381068823, "learning_rate": 9.690959409594095e-07, "logits/chosen": 0.6333984136581421, "logits/rejected": 0.62890625, "logps/chosen": -343.875, "logps/rejected": -355.8500061035156, "loss": 0.5813, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.705822765827179, "rewards/margins": 0.818896472454071, "rewards/rejected": -0.11328125, "step": 470 }, { "epoch": 0.1265155508697944, "grad_norm": 143.94423004102825, "learning_rate": 9.684370057986295e-07, "logits/chosen": 0.6208251714706421, "logits/rejected": 0.44853514432907104, "logps/chosen": -409.8999938964844, "logps/rejected": -364.70001220703125, "loss": 0.5936, "rewards/accuracies": 0.65625, "rewards/chosen": 0.49836426973342896, "rewards/margins": 0.8530517816543579, "rewards/rejected": -0.35535889863967896, "step": 480 }, { "epoch": 0.12915129151291513, "grad_norm": 159.73041495633126, "learning_rate": 9.677780706378492e-07, "logits/chosen": 0.6013427972793579, "logits/rejected": 0.6126708984375, "logps/chosen": -384.1499938964844, "logps/rejected": -370.04998779296875, "loss": 0.5647, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.6612304449081421, "rewards/margins": 0.745898425579071, "rewards/rejected": -0.08533935248851776, "step": 490 }, { "epoch": 0.13178703215603585, "grad_norm": 119.47640638835232, "learning_rate": 9.67119135477069e-07, "logits/chosen": 0.7598632574081421, "logits/rejected": 0.7098633050918579, "logps/chosen": -340.95001220703125, "logps/rejected": -356.1499938964844, "loss": 0.6047, "rewards/accuracies": 0.71875, "rewards/chosen": 0.970703125, "rewards/margins": 0.8202148675918579, "rewards/rejected": 0.15085449814796448, "step": 500 }, { "epoch": 0.13442277279915657, "grad_norm": 142.53736909743617, "learning_rate": 9.66460200316289e-07, "logits/chosen": 0.76025390625, "logits/rejected": 0.7706054449081421, "logps/chosen": -341.3500061035156, "logps/rejected": -308.125, "loss": 0.5856, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8875976800918579, "rewards/margins": 0.7395263910293579, "rewards/rejected": 0.14780578017234802, "step": 510 }, { "epoch": 0.1370585134422773, "grad_norm": 113.44305745782874, "learning_rate": 9.658012651555086e-07, "logits/chosen": 0.512316882610321, "logits/rejected": 0.6009277105331421, "logps/chosen": -342.04998779296875, "logps/rejected": -332.1499938964844, "loss": 0.5396, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7674316167831421, "rewards/margins": 1.019628882408142, "rewards/rejected": -0.2523025572299957, "step": 520 }, { "epoch": 0.13969425408539798, "grad_norm": 109.63653411813588, "learning_rate": 9.651423299947284e-07, "logits/chosen": 0.522045910358429, "logits/rejected": 0.4627685546875, "logps/chosen": -366.70001220703125, "logps/rejected": -339.04998779296875, "loss": 0.6977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12106017768383026, "rewards/margins": 0.655078113079071, "rewards/rejected": -0.533984363079071, "step": 530 }, { "epoch": 0.1423299947285187, "grad_norm": 139.12953549177024, "learning_rate": 9.644833948339483e-07, "logits/chosen": 0.534912109375, "logits/rejected": 0.5266479253768921, "logps/chosen": -358.1499938964844, "logps/rejected": -340.6000061035156, "loss": 0.6156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.42835694551467896, "rewards/margins": 0.7844482660293579, "rewards/rejected": -0.3559204041957855, "step": 540 }, { "epoch": 0.14496573537163943, "grad_norm": 121.49168833508993, "learning_rate": 9.63824459673168e-07, "logits/chosen": 0.6580230593681335, "logits/rejected": 0.752246081829071, "logps/chosen": -386.20001220703125, "logps/rejected": -348.1000061035156, "loss": 0.4963, "rewards/accuracies": 0.75, "rewards/chosen": 1.0691406726837158, "rewards/margins": 1.023779273033142, "rewards/rejected": 0.04560546949505806, "step": 550 }, { "epoch": 0.14760147601476015, "grad_norm": 160.702062995484, "learning_rate": 9.63165524512388e-07, "logits/chosen": 0.565136730670929, "logits/rejected": 0.590893566608429, "logps/chosen": -410.6499938964844, "logps/rejected": -366.6000061035156, "loss": 0.5596, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8327270746231079, "rewards/margins": 0.947998046875, "rewards/rejected": -0.11557617038488388, "step": 560 }, { "epoch": 0.15023721665788087, "grad_norm": 159.31735607219224, "learning_rate": 9.625065893516078e-07, "logits/chosen": 0.761035144329071, "logits/rejected": 0.676464855670929, "logps/chosen": -373.0, "logps/rejected": -352.79998779296875, "loss": 0.5714, "rewards/accuracies": 0.6875, "rewards/chosen": 0.928662121295929, "rewards/margins": 0.9736328125, "rewards/rejected": -0.04429931566119194, "step": 570 }, { "epoch": 0.1528729573010016, "grad_norm": 92.33599549365213, "learning_rate": 9.618476541908275e-07, "logits/chosen": 0.7643066644668579, "logits/rejected": 0.6423095464706421, "logps/chosen": -293.1000061035156, "logps/rejected": -273.0, "loss": 0.6097, "rewards/accuracies": 0.65625, "rewards/chosen": 0.762499988079071, "rewards/margins": 0.7307983636856079, "rewards/rejected": 0.03270263597369194, "step": 580 }, { "epoch": 0.1555086979441223, "grad_norm": 99.89049006679555, "learning_rate": 9.611887190300475e-07, "logits/chosen": 0.667187511920929, "logits/rejected": 0.6672729253768921, "logps/chosen": -330.79998779296875, "logps/rejected": -316.1499938964844, "loss": 0.5982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.720166027545929, "rewards/margins": 0.79156494140625, "rewards/rejected": -0.07243652641773224, "step": 590 }, { "epoch": 0.158144438587243, "grad_norm": 137.23017960689012, "learning_rate": 9.605297838692672e-07, "logits/chosen": 0.576953113079071, "logits/rejected": 0.5013672113418579, "logps/chosen": -336.6499938964844, "logps/rejected": -325.1000061035156, "loss": 0.6515, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5345214605331421, "rewards/margins": 0.712902843952179, "rewards/rejected": -0.17729492485523224, "step": 600 }, { "epoch": 0.16078017923036372, "grad_norm": 179.36411163038233, "learning_rate": 9.59870848708487e-07, "logits/chosen": 0.5728515386581421, "logits/rejected": 0.631103515625, "logps/chosen": -338.79998779296875, "logps/rejected": -330.1499938964844, "loss": 0.6776, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.42390137910842896, "rewards/margins": 0.5553954839706421, "rewards/rejected": -0.13115234673023224, "step": 610 }, { "epoch": 0.16341591987348444, "grad_norm": 120.54454032249754, "learning_rate": 9.592119135477067e-07, "logits/chosen": 0.700305163860321, "logits/rejected": 0.7689453363418579, "logps/chosen": -382.3999938964844, "logps/rejected": -347.2250061035156, "loss": 0.6424, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4169677793979645, "rewards/margins": 0.653759777545929, "rewards/rejected": -0.23603515326976776, "step": 620 }, { "epoch": 0.16605166051660517, "grad_norm": 143.78326691089205, "learning_rate": 9.585529783869267e-07, "logits/chosen": 0.556323230266571, "logits/rejected": 0.6026855707168579, "logps/chosen": -378.79998779296875, "logps/rejected": -389.20001220703125, "loss": 0.5586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6866210699081421, "rewards/margins": 0.924511730670929, "rewards/rejected": -0.23757323622703552, "step": 630 }, { "epoch": 0.16868740115972589, "grad_norm": 131.60476970693074, "learning_rate": 9.578940432261464e-07, "logits/chosen": 0.8165038824081421, "logits/rejected": 0.8218749761581421, "logps/chosen": -354.5, "logps/rejected": -330.79998779296875, "loss": 0.6451, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.544677734375, "rewards/margins": 0.651318371295929, "rewards/rejected": -0.1065673828125, "step": 640 }, { "epoch": 0.1713231418028466, "grad_norm": 123.36936097452622, "learning_rate": 9.572351080653664e-07, "logits/chosen": 0.77197265625, "logits/rejected": 0.7062011957168579, "logps/chosen": -379.8500061035156, "logps/rejected": -358.29998779296875, "loss": 0.6178, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6860290765762329, "rewards/margins": 0.752941906452179, "rewards/rejected": -0.06672058254480362, "step": 650 }, { "epoch": 0.17395888244596733, "grad_norm": 102.8908979604312, "learning_rate": 9.565761729045861e-07, "logits/chosen": 0.685742199420929, "logits/rejected": 0.6841796636581421, "logps/chosen": -339.6499938964844, "logps/rejected": -318.625, "loss": 0.6548, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.729785144329071, "rewards/margins": 0.45867919921875, "rewards/rejected": 0.2715393006801605, "step": 660 }, { "epoch": 0.17659462308908802, "grad_norm": 122.3428313678326, "learning_rate": 9.559172377438059e-07, "logits/chosen": 0.731249988079071, "logits/rejected": 0.721972644329071, "logps/chosen": -406.29998779296875, "logps/rejected": -362.20001220703125, "loss": 0.6065, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.7769531011581421, "rewards/margins": 0.73529052734375, "rewards/rejected": 0.040618896484375, "step": 670 }, { "epoch": 0.17923036373220874, "grad_norm": 98.8000912981556, "learning_rate": 9.552583025830258e-07, "logits/chosen": 0.7005859613418579, "logits/rejected": 0.622753918170929, "logps/chosen": -359.1499938964844, "logps/rejected": -310.0, "loss": 0.521, "rewards/accuracies": 0.71875, "rewards/chosen": 0.661865234375, "rewards/margins": 1.1085937023162842, "rewards/rejected": -0.44648438692092896, "step": 680 }, { "epoch": 0.18186610437532946, "grad_norm": 123.22781859427744, "learning_rate": 9.545993674222456e-07, "logits/chosen": 0.666259765625, "logits/rejected": 0.560717761516571, "logps/chosen": -334.29998779296875, "logps/rejected": -302.79998779296875, "loss": 0.6166, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2660888731479645, "rewards/margins": 0.702014148235321, "rewards/rejected": -0.43647462129592896, "step": 690 }, { "epoch": 0.18450184501845018, "grad_norm": 126.01393089541295, "learning_rate": 9.539404322614655e-07, "logits/chosen": 0.568103015422821, "logits/rejected": 0.5802978277206421, "logps/chosen": -369.70001220703125, "logps/rejected": -355.54998779296875, "loss": 0.6461, "rewards/accuracies": 0.6875, "rewards/chosen": 0.606884777545929, "rewards/margins": 0.701611340045929, "rewards/rejected": -0.09531860053539276, "step": 700 }, { "epoch": 0.1871375856615709, "grad_norm": 138.37574125381647, "learning_rate": 9.532814971006853e-07, "logits/chosen": 0.84521484375, "logits/rejected": 0.748291015625, "logps/chosen": -385.3500061035156, "logps/rejected": -367.6499938964844, "loss": 0.6467, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.592333972454071, "rewards/margins": 0.6700195074081421, "rewards/rejected": -0.07759399712085724, "step": 710 }, { "epoch": 0.18977332630469163, "grad_norm": 98.44293424971933, "learning_rate": 9.526225619399051e-07, "logits/chosen": 0.7597900629043579, "logits/rejected": 0.550488293170929, "logps/chosen": -388.20001220703125, "logps/rejected": -361.0, "loss": 0.585, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8799804449081421, "rewards/margins": 0.789257824420929, "rewards/rejected": 0.09089355170726776, "step": 720 }, { "epoch": 0.19240906694781235, "grad_norm": 141.09442962683667, "learning_rate": 9.519636267791249e-07, "logits/chosen": 0.7782226800918579, "logits/rejected": 0.7811523675918579, "logps/chosen": -318.5, "logps/rejected": -316.70001220703125, "loss": 0.607, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8485351800918579, "rewards/margins": 0.8671875, "rewards/rejected": -0.01912841759622097, "step": 730 }, { "epoch": 0.19504480759093304, "grad_norm": 147.8815128950205, "learning_rate": 9.513046916183447e-07, "logits/chosen": 0.842846691608429, "logits/rejected": 0.7627929449081421, "logps/chosen": -392.3999938964844, "logps/rejected": -335.8999938964844, "loss": 0.4959, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.8548583984375, "rewards/margins": 1.02734375, "rewards/rejected": -0.17246094346046448, "step": 740 }, { "epoch": 0.19768054823405376, "grad_norm": 163.63360505399353, "learning_rate": 9.506457564575645e-07, "logits/chosen": 0.599261462688446, "logits/rejected": 0.63494873046875, "logps/chosen": -356.20001220703125, "logps/rejected": -302.3999938964844, "loss": 0.5439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.769726574420929, "rewards/margins": 0.895263671875, "rewards/rejected": -0.12556762993335724, "step": 750 }, { "epoch": 0.20031628887717448, "grad_norm": 110.09308376793321, "learning_rate": 9.499868212967843e-07, "logits/chosen": 0.752148449420929, "logits/rejected": 0.7170165777206421, "logps/chosen": -365.3500061035156, "logps/rejected": -325.95001220703125, "loss": 0.4791, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.763232409954071, "rewards/margins": 1.199853539466858, "rewards/rejected": -0.43684083223342896, "step": 760 }, { "epoch": 0.2029520295202952, "grad_norm": 110.61108084006821, "learning_rate": 9.493278861360042e-07, "logits/chosen": 0.666796863079071, "logits/rejected": 0.740234375, "logps/chosen": -310.0, "logps/rejected": -340.54998779296875, "loss": 0.6656, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.5832885503768921, "rewards/margins": 0.80035400390625, "rewards/rejected": -0.2169189453125, "step": 770 }, { "epoch": 0.20558777016341592, "grad_norm": 153.14474639473198, "learning_rate": 9.48668950975224e-07, "logits/chosen": 0.49714964628219604, "logits/rejected": 0.5010010004043579, "logps/chosen": -411.20001220703125, "logps/rejected": -393.45001220703125, "loss": 0.5862, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3808837831020355, "rewards/margins": 0.836865246295929, "rewards/rejected": -0.45698851346969604, "step": 780 }, { "epoch": 0.20822351080653664, "grad_norm": 134.56659878765083, "learning_rate": 9.480100158144439e-07, "logits/chosen": 0.5835937261581421, "logits/rejected": 0.3881591856479645, "logps/chosen": -379.3999938964844, "logps/rejected": -345.25, "loss": 0.6394, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.07694091647863388, "rewards/margins": 0.7923583984375, "rewards/rejected": -0.7147461175918579, "step": 790 }, { "epoch": 0.21085925144965736, "grad_norm": 137.77701964083064, "learning_rate": 9.473510806536636e-07, "logits/chosen": 0.3725952208042145, "logits/rejected": 0.4303222596645355, "logps/chosen": -329.375, "logps/rejected": -322.95001220703125, "loss": 0.6185, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03563232347369194, "rewards/margins": 0.849682629108429, "rewards/rejected": -0.8135436773300171, "step": 800 }, { "epoch": 0.21349499209277806, "grad_norm": 131.01678526754063, "learning_rate": 9.466921454928835e-07, "logits/chosen": 0.698138415813446, "logits/rejected": 0.710009753704071, "logps/chosen": -413.1499938964844, "logps/rejected": -374.0, "loss": 0.5999, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3143371641635895, "rewards/margins": 0.8194824457168579, "rewards/rejected": -0.5044647455215454, "step": 810 }, { "epoch": 0.21613073273589878, "grad_norm": 152.60453253027055, "learning_rate": 9.460332103321033e-07, "logits/chosen": 0.7003418207168579, "logits/rejected": 0.6539551019668579, "logps/chosen": -390.54998779296875, "logps/rejected": -372.20001220703125, "loss": 0.5336, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.673999011516571, "rewards/margins": 0.921093761920929, "rewards/rejected": -0.24705810844898224, "step": 820 }, { "epoch": 0.2187664733790195, "grad_norm": 117.07726273780258, "learning_rate": 9.45374275171323e-07, "logits/chosen": 0.855175793170929, "logits/rejected": 0.785400390625, "logps/chosen": -382.25, "logps/rejected": -347.3999938964844, "loss": 0.5986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.655261218547821, "rewards/margins": 0.904833972454071, "rewards/rejected": -0.24985352158546448, "step": 830 }, { "epoch": 0.22140221402214022, "grad_norm": 119.93633132762334, "learning_rate": 9.447153400105429e-07, "logits/chosen": 0.9052734375, "logits/rejected": 0.78924560546875, "logps/chosen": -367.8500061035156, "logps/rejected": -328.95001220703125, "loss": 0.6215, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.90380859375, "rewards/margins": 0.887939453125, "rewards/rejected": 0.01573486253619194, "step": 840 }, { "epoch": 0.22403795466526094, "grad_norm": 118.88409439302313, "learning_rate": 9.440564048497628e-07, "logits/chosen": 0.926464855670929, "logits/rejected": 0.983593761920929, "logps/chosen": -322.5, "logps/rejected": -320.75, "loss": 0.5989, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.891986072063446, "rewards/margins": 0.8827148675918579, "rewards/rejected": 0.00868835486471653, "step": 850 }, { "epoch": 0.22667369530838166, "grad_norm": 103.33482782478602, "learning_rate": 9.433974696889826e-07, "logits/chosen": 0.9493163824081421, "logits/rejected": 0.9180663824081421, "logps/chosen": -342.70001220703125, "logps/rejected": -333.1000061035156, "loss": 0.7451, "rewards/accuracies": 0.625, "rewards/chosen": 0.9049926996231079, "rewards/margins": 0.596142590045929, "rewards/rejected": 0.3086196780204773, "step": 860 }, { "epoch": 0.22930943595150238, "grad_norm": 127.59194298117791, "learning_rate": 9.427385345282025e-07, "logits/chosen": 0.8921874761581421, "logits/rejected": 0.970898449420929, "logps/chosen": -350.625, "logps/rejected": -299.54998779296875, "loss": 0.5584, "rewards/accuracies": 0.6875, "rewards/chosen": 0.847949206829071, "rewards/margins": 0.821606457233429, "rewards/rejected": 0.027130126953125, "step": 870 }, { "epoch": 0.23194517659462308, "grad_norm": 110.3439626163015, "learning_rate": 9.420795993674222e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.6583251953125, "logps/chosen": -374.79998779296875, "logps/rejected": -348.20001220703125, "loss": 0.5242, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7115234136581421, "rewards/margins": 0.9365234375, "rewards/rejected": -0.22490234673023224, "step": 880 }, { "epoch": 0.2345809172377438, "grad_norm": 121.7137435279708, "learning_rate": 9.41420664206642e-07, "logits/chosen": 0.7950195074081421, "logits/rejected": 0.605621337890625, "logps/chosen": -419.75, "logps/rejected": -333.95001220703125, "loss": 0.5672, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.703125, "rewards/margins": 1.0041992664337158, "rewards/rejected": -0.3008361756801605, "step": 890 }, { "epoch": 0.23721665788086452, "grad_norm": 159.73441479191118, "learning_rate": 9.407617290458618e-07, "logits/chosen": 0.749011218547821, "logits/rejected": 0.739550769329071, "logps/chosen": -391.75, "logps/rejected": -368.1000061035156, "loss": 0.5971, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.676513671875, "rewards/margins": 0.923779308795929, "rewards/rejected": -0.24697265028953552, "step": 900 }, { "epoch": 0.23985239852398524, "grad_norm": 102.08440033918956, "learning_rate": 9.401027938850816e-07, "logits/chosen": 0.8726562261581421, "logits/rejected": 0.913281261920929, "logps/chosen": -309.79998779296875, "logps/rejected": -305.95001220703125, "loss": 0.5341, "rewards/accuracies": 0.75, "rewards/chosen": 1.0759766101837158, "rewards/margins": 1.1321289539337158, "rewards/rejected": -0.05620117112994194, "step": 910 }, { "epoch": 0.24248813916710596, "grad_norm": 138.41931914622003, "learning_rate": 9.394438587243015e-07, "logits/chosen": 0.8721679449081421, "logits/rejected": 0.925732433795929, "logps/chosen": -381.5, "logps/rejected": -360.29998779296875, "loss": 0.5894, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.7943115234375, "rewards/margins": 0.996337890625, "rewards/rejected": -0.20130614936351776, "step": 920 }, { "epoch": 0.24512387981022668, "grad_norm": 133.21580163649944, "learning_rate": 9.387849235635213e-07, "logits/chosen": 0.8067382574081421, "logits/rejected": 0.8517822027206421, "logps/chosen": -377.29998779296875, "logps/rejected": -372.1000061035156, "loss": 0.8152, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.540722668170929, "rewards/margins": 0.4437011778354645, "rewards/rejected": 0.096435546875, "step": 930 }, { "epoch": 0.2477596204533474, "grad_norm": 139.51585283029172, "learning_rate": 9.381259884027412e-07, "logits/chosen": 0.952832043170929, "logits/rejected": 0.952929675579071, "logps/chosen": -396.6499938964844, "logps/rejected": -381.70001220703125, "loss": 0.5934, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5255126953125, "rewards/margins": 0.7482665777206421, "rewards/rejected": -0.22207947075366974, "step": 940 }, { "epoch": 0.2503953610964681, "grad_norm": 148.61481547620104, "learning_rate": 9.374670532419609e-07, "logits/chosen": 0.7479003667831421, "logits/rejected": 0.7457519769668579, "logps/chosen": -310.79998779296875, "logps/rejected": -322.75, "loss": 0.6164, "rewards/accuracies": 0.65625, "rewards/chosen": 0.37612611055374146, "rewards/margins": 0.8268798589706421, "rewards/rejected": -0.45069581270217896, "step": 950 }, { "epoch": 0.2530311017395888, "grad_norm": 157.36968008668546, "learning_rate": 9.368081180811808e-07, "logits/chosen": 0.6635376214981079, "logits/rejected": 0.654003918170929, "logps/chosen": -382.79998779296875, "logps/rejected": -396.29998779296875, "loss": 0.5662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.373587042093277, "rewards/margins": 1.0686523914337158, "rewards/rejected": -0.69512939453125, "step": 960 }, { "epoch": 0.25566684238270954, "grad_norm": 148.1637140569032, "learning_rate": 9.361491829204005e-07, "logits/chosen": 0.74853515625, "logits/rejected": 0.725292980670929, "logps/chosen": -343.04998779296875, "logps/rejected": -322.95001220703125, "loss": 0.5616, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.33308106660842896, "rewards/margins": 0.7152465581893921, "rewards/rejected": -0.38177490234375, "step": 970 }, { "epoch": 0.25830258302583026, "grad_norm": 116.81502893419757, "learning_rate": 9.354902477596204e-07, "logits/chosen": 0.5973418951034546, "logits/rejected": 0.6024414300918579, "logps/chosen": -407.1000061035156, "logps/rejected": -372.79998779296875, "loss": 0.4833, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.46131592988967896, "rewards/margins": 1.150732398033142, "rewards/rejected": -0.6885986328125, "step": 980 }, { "epoch": 0.260938323668951, "grad_norm": 114.59931619973933, "learning_rate": 9.348313125988402e-07, "logits/chosen": 0.8544921875, "logits/rejected": 0.924609363079071, "logps/chosen": -372.0, "logps/rejected": -355.5, "loss": 0.6277, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.42554014921188354, "rewards/margins": 0.68408203125, "rewards/rejected": -0.25849610567092896, "step": 990 }, { "epoch": 0.2635740643120717, "grad_norm": 143.99323274239475, "learning_rate": 9.341723774380601e-07, "logits/chosen": 0.855664074420929, "logits/rejected": 0.827197253704071, "logps/chosen": -379.0, "logps/rejected": -362.04998779296875, "loss": 0.726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6077178716659546, "rewards/margins": 0.45502930879592896, "rewards/rejected": 0.15289306640625, "step": 1000 }, { "epoch": 0.2662098049551924, "grad_norm": 75.6173783985854, "learning_rate": 9.335134422772799e-07, "logits/chosen": 0.9111328125, "logits/rejected": 0.8194824457168579, "logps/chosen": -366.1000061035156, "logps/rejected": -305.1499938964844, "loss": 0.5912, "rewards/accuracies": 0.65625, "rewards/chosen": 0.9005126953125, "rewards/margins": 0.7840210199356079, "rewards/rejected": 0.11655273288488388, "step": 1010 }, { "epoch": 0.26884554559831314, "grad_norm": 86.73990788589121, "learning_rate": 9.328545071164997e-07, "logits/chosen": 1.006982445716858, "logits/rejected": 0.9622558355331421, "logps/chosen": -353.6499938964844, "logps/rejected": -316.8500061035156, "loss": 0.6441, "rewards/accuracies": 0.65625, "rewards/chosen": 0.756103515625, "rewards/margins": 0.8855956792831421, "rewards/rejected": -0.12860412895679474, "step": 1020 }, { "epoch": 0.27148128624143386, "grad_norm": 124.99428288589459, "learning_rate": 9.321955719557195e-07, "logits/chosen": 0.9290527105331421, "logits/rejected": 0.8731445074081421, "logps/chosen": -397.3999938964844, "logps/rejected": -369.5, "loss": 0.5453, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.86956787109375, "rewards/margins": 0.940673828125, "rewards/rejected": -0.0711822509765625, "step": 1030 }, { "epoch": 0.2741170268845546, "grad_norm": 74.10156236223588, "learning_rate": 9.315366367949393e-07, "logits/chosen": 0.8140624761581421, "logits/rejected": 0.797558605670929, "logps/chosen": -368.1499938964844, "logps/rejected": -344.0, "loss": 0.5048, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7738281488418579, "rewards/margins": 1.147680640220642, "rewards/rejected": -0.3733154237270355, "step": 1040 }, { "epoch": 0.2767527675276753, "grad_norm": 87.66360656560568, "learning_rate": 9.308777016341591e-07, "logits/chosen": 0.86083984375, "logits/rejected": 0.8076171875, "logps/chosen": -364.54998779296875, "logps/rejected": -341.04998779296875, "loss": 0.6024, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4393554627895355, "rewards/margins": 0.8891845941543579, "rewards/rejected": -0.44927978515625, "step": 1050 }, { "epoch": 0.27938850817079597, "grad_norm": 191.3102861815495, "learning_rate": 9.30218766473379e-07, "logits/chosen": 0.7977539300918579, "logits/rejected": 0.707714855670929, "logps/chosen": -385.8999938964844, "logps/rejected": -312.45001220703125, "loss": 0.5814, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.626708984375, "rewards/margins": 0.874682605266571, "rewards/rejected": -0.24882812798023224, "step": 1060 }, { "epoch": 0.2820242488139167, "grad_norm": 119.00436642015389, "learning_rate": 9.295598313125988e-07, "logits/chosen": 0.64715576171875, "logits/rejected": 0.672119140625, "logps/chosen": -351.6499938964844, "logps/rejected": -359.95001220703125, "loss": 0.6639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23647460341453552, "rewards/margins": 0.7097412347793579, "rewards/rejected": -0.47247314453125, "step": 1070 }, { "epoch": 0.2846599894570374, "grad_norm": 156.40659546052223, "learning_rate": 9.289008961518187e-07, "logits/chosen": 0.6376953125, "logits/rejected": 0.653369128704071, "logps/chosen": -350.95001220703125, "logps/rejected": -366.1000061035156, "loss": 0.5173, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3167724609375, "rewards/margins": 1.024682641029358, "rewards/rejected": -0.7071288824081421, "step": 1080 }, { "epoch": 0.28729573010015813, "grad_norm": 109.32872670823303, "learning_rate": 9.282419609910384e-07, "logits/chosen": 0.736071765422821, "logits/rejected": 0.637011706829071, "logps/chosen": -378.45001220703125, "logps/rejected": -356.5, "loss": 0.6885, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.288360595703125, "rewards/margins": 0.6290527582168579, "rewards/rejected": -0.34071046113967896, "step": 1090 }, { "epoch": 0.28993147074327885, "grad_norm": 103.8989397603341, "learning_rate": 9.275830258302583e-07, "logits/chosen": 0.836749255657196, "logits/rejected": 0.7803710699081421, "logps/chosen": -380.6000061035156, "logps/rejected": -353.29998779296875, "loss": 0.5287, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.589312732219696, "rewards/margins": 1.024560570716858, "rewards/rejected": -0.4346862733364105, "step": 1100 }, { "epoch": 0.2925672113863996, "grad_norm": 124.33076133790577, "learning_rate": 9.269240906694781e-07, "logits/chosen": 0.823437511920929, "logits/rejected": 0.77862548828125, "logps/chosen": -397.54998779296875, "logps/rejected": -347.8500061035156, "loss": 0.4872, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.78759765625, "rewards/margins": 1.210839867591858, "rewards/rejected": -0.42290037870407104, "step": 1110 }, { "epoch": 0.2952029520295203, "grad_norm": 60.64278940226819, "learning_rate": 9.262651555086979e-07, "logits/chosen": 0.783496081829071, "logits/rejected": 0.8111572265625, "logps/chosen": -384.5, "logps/rejected": -328.3999938964844, "loss": 0.5806, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6327148675918579, "rewards/margins": 0.9817870855331421, "rewards/rejected": -0.34893798828125, "step": 1120 }, { "epoch": 0.297838692672641, "grad_norm": 127.83621534708885, "learning_rate": 9.256062203479177e-07, "logits/chosen": 0.788330078125, "logits/rejected": 0.740673840045929, "logps/chosen": -321.45001220703125, "logps/rejected": -327.6499938964844, "loss": 0.6, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.616259753704071, "rewards/margins": 0.711010754108429, "rewards/rejected": -0.09450683742761612, "step": 1130 }, { "epoch": 0.30047443331576174, "grad_norm": 119.48424005060089, "learning_rate": 9.249472851871375e-07, "logits/chosen": 0.7367187738418579, "logits/rejected": 0.71630859375, "logps/chosen": -320.1000061035156, "logps/rejected": -330.04998779296875, "loss": 0.5537, "rewards/accuracies": 0.71875, "rewards/chosen": 0.733349621295929, "rewards/margins": 0.827136218547821, "rewards/rejected": -0.09355469048023224, "step": 1140 }, { "epoch": 0.30311017395888246, "grad_norm": 122.12411649770877, "learning_rate": 9.242883500263574e-07, "logits/chosen": 0.7007812261581421, "logits/rejected": 0.756396472454071, "logps/chosen": -325.95001220703125, "logps/rejected": -321.3500061035156, "loss": 0.5948, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5941711664199829, "rewards/margins": 0.9048095941543579, "rewards/rejected": -0.3107055723667145, "step": 1150 }, { "epoch": 0.3057459146020032, "grad_norm": 84.77133363959533, "learning_rate": 9.236294148655773e-07, "logits/chosen": 0.688916027545929, "logits/rejected": 0.679492175579071, "logps/chosen": -368.0, "logps/rejected": -347.0, "loss": 0.6432, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5746704339981079, "rewards/margins": 0.760913074016571, "rewards/rejected": -0.18768310546875, "step": 1160 }, { "epoch": 0.3083816552451239, "grad_norm": 175.26590848360377, "learning_rate": 9.22970479704797e-07, "logits/chosen": 0.7371581792831421, "logits/rejected": 0.748339831829071, "logps/chosen": -339.45001220703125, "logps/rejected": -353.5, "loss": 0.5674, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7640746831893921, "rewards/margins": 0.908154308795929, "rewards/rejected": -0.14364013075828552, "step": 1170 }, { "epoch": 0.3110173958882446, "grad_norm": 154.38997138156236, "learning_rate": 9.223115445440169e-07, "logits/chosen": 0.781445324420929, "logits/rejected": 0.8192383050918579, "logps/chosen": -373.8500061035156, "logps/rejected": -377.95001220703125, "loss": 0.534, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7945312261581421, "rewards/margins": 1.131738305091858, "rewards/rejected": -0.33686524629592896, "step": 1180 }, { "epoch": 0.31365313653136534, "grad_norm": 115.83475028713559, "learning_rate": 9.216526093832366e-07, "logits/chosen": 0.783203125, "logits/rejected": 0.8030761480331421, "logps/chosen": -371.95001220703125, "logps/rejected": -377.6000061035156, "loss": 0.5732, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4174743592739105, "rewards/margins": 0.890911877155304, "rewards/rejected": -0.47333985567092896, "step": 1190 }, { "epoch": 0.316288877174486, "grad_norm": 200.24549692509243, "learning_rate": 9.209936742224565e-07, "logits/chosen": 0.785473644733429, "logits/rejected": 0.755566418170929, "logps/chosen": -401.1499938964844, "logps/rejected": -356.1000061035156, "loss": 0.7109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4524169862270355, "rewards/margins": 0.7278198003768921, "rewards/rejected": -0.27656251192092896, "step": 1200 }, { "epoch": 0.3189246178176067, "grad_norm": 115.92625348504299, "learning_rate": 9.203347390616762e-07, "logits/chosen": 0.720947265625, "logits/rejected": 0.774218738079071, "logps/chosen": -351.5, "logps/rejected": -363.0, "loss": 0.533, "rewards/accuracies": 0.71875, "rewards/chosen": 0.705126941204071, "rewards/margins": 0.9720703363418579, "rewards/rejected": -0.2668518126010895, "step": 1210 }, { "epoch": 0.32156035846072745, "grad_norm": 116.31814464813371, "learning_rate": 9.196758039008962e-07, "logits/chosen": 0.697216808795929, "logits/rejected": 0.84228515625, "logps/chosen": -323.57501220703125, "logps/rejected": -299.3500061035156, "loss": 0.5502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7698730230331421, "rewards/margins": 0.9715331792831421, "rewards/rejected": -0.20074462890625, "step": 1220 }, { "epoch": 0.32419609910384817, "grad_norm": 105.01957153986376, "learning_rate": 9.19016868740116e-07, "logits/chosen": 0.745068371295929, "logits/rejected": 0.699999988079071, "logps/chosen": -379.0, "logps/rejected": -363.95001220703125, "loss": 0.5606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6520630121231079, "rewards/margins": 1.185083031654358, "rewards/rejected": -0.5328369140625, "step": 1230 }, { "epoch": 0.3268318397469689, "grad_norm": 101.97447719636715, "learning_rate": 9.183579335793357e-07, "logits/chosen": 0.719799816608429, "logits/rejected": 0.6653076410293579, "logps/chosen": -393.04998779296875, "logps/rejected": -368.3999938964844, "loss": 0.6652, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.43305665254592896, "rewards/margins": 0.8941894769668579, "rewards/rejected": -0.4621337950229645, "step": 1240 }, { "epoch": 0.3294675803900896, "grad_norm": 158.04641358665336, "learning_rate": 9.176989984185556e-07, "logits/chosen": 0.6685791015625, "logits/rejected": 0.734667956829071, "logps/chosen": -330.0, "logps/rejected": -354.29998779296875, "loss": 0.6495, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6243896484375, "rewards/margins": 0.6799072027206421, "rewards/rejected": -0.05656738206744194, "step": 1250 }, { "epoch": 0.33210332103321033, "grad_norm": 169.7631818045456, "learning_rate": 9.170400632577753e-07, "logits/chosen": 0.656982421875, "logits/rejected": 0.622265636920929, "logps/chosen": -413.1499938964844, "logps/rejected": -365.6000061035156, "loss": 0.5262, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.594482421875, "rewards/margins": 1.1302978992462158, "rewards/rejected": -0.5357910394668579, "step": 1260 }, { "epoch": 0.33473906167633105, "grad_norm": 106.84148418787453, "learning_rate": 9.163811280969952e-07, "logits/chosen": 0.785351574420929, "logits/rejected": 0.78662109375, "logps/chosen": -356.3999938964844, "logps/rejected": -360.70001220703125, "loss": 0.6564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5738525390625, "rewards/margins": 0.86962890625, "rewards/rejected": -0.2952880859375, "step": 1270 }, { "epoch": 0.33737480231945177, "grad_norm": 114.8072555231451, "learning_rate": 9.15722192936215e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.755444347858429, "logps/chosen": -309.04998779296875, "logps/rejected": -310.3500061035156, "loss": 0.5855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.681622326374054, "rewards/margins": 0.887988269329071, "rewards/rejected": -0.20627442002296448, "step": 1280 }, { "epoch": 0.3400105429625725, "grad_norm": 194.29900799132116, "learning_rate": 9.150632577754348e-07, "logits/chosen": 0.782470703125, "logits/rejected": 0.748242199420929, "logps/chosen": -384.04998779296875, "logps/rejected": -366.04998779296875, "loss": 0.6877, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5123840570449829, "rewards/margins": 0.724591076374054, "rewards/rejected": -0.21186523139476776, "step": 1290 }, { "epoch": 0.3426462836056932, "grad_norm": 110.90477232372977, "learning_rate": 9.144043226146547e-07, "logits/chosen": 0.845898449420929, "logits/rejected": 0.846118152141571, "logps/chosen": -349.3500061035156, "logps/rejected": -344.54998779296875, "loss": 0.5823, "rewards/accuracies": 0.6875, "rewards/chosen": 0.618847668170929, "rewards/margins": 0.899279773235321, "rewards/rejected": -0.2806030213832855, "step": 1300 }, { "epoch": 0.34528202424881393, "grad_norm": 107.87804398249395, "learning_rate": 9.137453874538745e-07, "logits/chosen": 0.7548828125, "logits/rejected": 0.7461913824081421, "logps/chosen": -381.1000061035156, "logps/rejected": -310.75, "loss": 0.5742, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6211273074150085, "rewards/margins": 0.808697521686554, "rewards/rejected": -0.18770751357078552, "step": 1310 }, { "epoch": 0.34791776489193466, "grad_norm": 155.8316983814618, "learning_rate": 9.130864522930943e-07, "logits/chosen": 0.741503894329071, "logits/rejected": 0.69244384765625, "logps/chosen": -374.95001220703125, "logps/rejected": -327.95001220703125, "loss": 0.5894, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.58795166015625, "rewards/margins": 1.043615698814392, "rewards/rejected": -0.45573729276657104, "step": 1320 }, { "epoch": 0.3505535055350554, "grad_norm": 98.24465565696505, "learning_rate": 9.124275171323142e-07, "logits/chosen": 0.887890636920929, "logits/rejected": 0.748779296875, "logps/chosen": -334.1499938964844, "logps/rejected": -302.1499938964844, "loss": 0.6667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.44721680879592896, "rewards/margins": 0.7411133050918579, "rewards/rejected": -0.29291993379592896, "step": 1330 }, { "epoch": 0.35318924617817604, "grad_norm": 79.86485694031194, "learning_rate": 9.117685819715339e-07, "logits/chosen": 1.036523461341858, "logits/rejected": 0.9609375, "logps/chosen": -375.3500061035156, "logps/rejected": -358.04998779296875, "loss": 0.6332, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.928662121295929, "rewards/margins": 0.845166027545929, "rewards/rejected": 0.08344421535730362, "step": 1340 }, { "epoch": 0.35582498682129676, "grad_norm": 139.99695860125507, "learning_rate": 9.111096468107538e-07, "logits/chosen": 0.880859375, "logits/rejected": 0.9452148675918579, "logps/chosen": -381.29998779296875, "logps/rejected": -357.5, "loss": 0.5818, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.9588378667831421, "rewards/margins": 0.854931652545929, "rewards/rejected": 0.10378418117761612, "step": 1350 }, { "epoch": 0.3584607274644175, "grad_norm": 146.17354657461303, "learning_rate": 9.104507116499735e-07, "logits/chosen": 0.78271484375, "logits/rejected": 0.602832019329071, "logps/chosen": -407.54998779296875, "logps/rejected": -318.5, "loss": 0.6532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.577868640422821, "rewards/margins": 0.8050781488418579, "rewards/rejected": -0.2264404296875, "step": 1360 }, { "epoch": 0.3610964681075382, "grad_norm": 117.30646855701293, "learning_rate": 9.097917764891935e-07, "logits/chosen": 0.8056640625, "logits/rejected": 0.6501220464706421, "logps/chosen": -424.79998779296875, "logps/rejected": -333.79998779296875, "loss": 0.5288, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5378783941268921, "rewards/margins": 1.0478515625, "rewards/rejected": -0.5090576410293579, "step": 1370 }, { "epoch": 0.3637322087506589, "grad_norm": 165.79161556331522, "learning_rate": 9.091328413284133e-07, "logits/chosen": 0.8794921636581421, "logits/rejected": 0.7666991949081421, "logps/chosen": -360.1499938964844, "logps/rejected": -340.1499938964844, "loss": 0.4983, "rewards/accuracies": 0.78125, "rewards/chosen": 1.155664086341858, "rewards/margins": 1.307031273841858, "rewards/rejected": -0.15137329697608948, "step": 1380 }, { "epoch": 0.36636794939377965, "grad_norm": 183.89311357056096, "learning_rate": 9.084739061676331e-07, "logits/chosen": 0.842968761920929, "logits/rejected": 0.867968738079071, "logps/chosen": -346.20001220703125, "logps/rejected": -328.54998779296875, "loss": 0.6743, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5744384527206421, "rewards/margins": 0.686535656452179, "rewards/rejected": -0.11232910305261612, "step": 1390 }, { "epoch": 0.36900369003690037, "grad_norm": 95.79971554297174, "learning_rate": 9.078149710068529e-07, "logits/chosen": 0.9419921636581421, "logits/rejected": 0.819580078125, "logps/chosen": -401.20001220703125, "logps/rejected": -349.0, "loss": 0.5828, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.837939441204071, "rewards/margins": 0.8355468511581421, "rewards/rejected": 0.001953125, "step": 1400 }, { "epoch": 0.3716394306800211, "grad_norm": 113.20108566826646, "learning_rate": 9.071560358460727e-07, "logits/chosen": 0.909472644329071, "logits/rejected": 0.817944347858429, "logps/chosen": -382.45001220703125, "logps/rejected": -325.8999938964844, "loss": 0.5926, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.881298840045929, "rewards/margins": 0.9349609613418579, "rewards/rejected": -0.05355224758386612, "step": 1410 }, { "epoch": 0.3742751713231418, "grad_norm": 99.49754921355128, "learning_rate": 9.064971006852925e-07, "logits/chosen": 0.91357421875, "logits/rejected": 0.9957031011581421, "logps/chosen": -388.25, "logps/rejected": -363.54998779296875, "loss": 0.5604, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.0007812976837158, "rewards/margins": 0.9629272222518921, "rewards/rejected": 0.03815307468175888, "step": 1420 }, { "epoch": 0.37691091196626253, "grad_norm": 112.80288155975659, "learning_rate": 9.058381655245123e-07, "logits/chosen": 0.822949230670929, "logits/rejected": 0.7623046636581421, "logps/chosen": -300.8500061035156, "logps/rejected": -278.1000061035156, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": 0.937792956829071, "rewards/margins": 0.6947021484375, "rewards/rejected": 0.24336548149585724, "step": 1430 }, { "epoch": 0.37954665260938325, "grad_norm": 106.53930612381137, "learning_rate": 9.051792303637321e-07, "logits/chosen": 0.9341796636581421, "logits/rejected": 0.840380847454071, "logps/chosen": -430.04998779296875, "logps/rejected": -368.45001220703125, "loss": 0.5196, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.40252685546875, "rewards/margins": 1.1318848133087158, "rewards/rejected": 0.26959228515625, "step": 1440 }, { "epoch": 0.38218239325250397, "grad_norm": 77.07944584365102, "learning_rate": 9.045202952029521e-07, "logits/chosen": 0.8521484136581421, "logits/rejected": 0.8363281488418579, "logps/chosen": -342.45001220703125, "logps/rejected": -337.54998779296875, "loss": 0.4983, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.1025390625, "rewards/margins": 1.0810546875, "rewards/rejected": 0.02092285081744194, "step": 1450 }, { "epoch": 0.3848181338956247, "grad_norm": 176.01049518882465, "learning_rate": 9.038613600421718e-07, "logits/chosen": 0.8428710699081421, "logits/rejected": 0.8583008050918579, "logps/chosen": -355.7250061035156, "logps/rejected": -314.8500061035156, "loss": 0.5898, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.701416015625, "rewards/margins": 0.892578125, "rewards/rejected": -0.19101563096046448, "step": 1460 }, { "epoch": 0.3874538745387454, "grad_norm": 88.576139722906, "learning_rate": 9.032024248813917e-07, "logits/chosen": 0.760693371295929, "logits/rejected": 0.675732433795929, "logps/chosen": -381.54998779296875, "logps/rejected": -348.5, "loss": 0.5368, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7752624750137329, "rewards/margins": 1.3823363780975342, "rewards/rejected": -0.6065918207168579, "step": 1470 }, { "epoch": 0.3900896151818661, "grad_norm": 96.92802334981937, "learning_rate": 9.025434897206114e-07, "logits/chosen": 0.67083740234375, "logits/rejected": 0.660900890827179, "logps/chosen": -401.0, "logps/rejected": -386.29998779296875, "loss": 0.6299, "rewards/accuracies": 0.71875, "rewards/chosen": 0.76953125, "rewards/margins": 1.0395996570587158, "rewards/rejected": -0.26920777559280396, "step": 1480 }, { "epoch": 0.3927253558249868, "grad_norm": 108.439095390184, "learning_rate": 9.018845545598313e-07, "logits/chosen": 0.7564941644668579, "logits/rejected": 0.724414050579071, "logps/chosen": -376.75, "logps/rejected": -346.3999938964844, "loss": 0.6606, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.638964831829071, "rewards/margins": 0.7603759765625, "rewards/rejected": -0.12060546875, "step": 1490 }, { "epoch": 0.3953610964681075, "grad_norm": 116.23331575049392, "learning_rate": 9.012256193990511e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.8084472417831421, "logps/chosen": -344.54998779296875, "logps/rejected": -330.04998779296875, "loss": 0.5634, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6576172113418579, "rewards/margins": 1.2001464366912842, "rewards/rejected": -0.543408215045929, "step": 1500 }, { "epoch": 0.39799683711122824, "grad_norm": 157.74163133763466, "learning_rate": 9.005666842382709e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.711132824420929, "logps/chosen": -358.5, "logps/rejected": -302.75, "loss": 0.59, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.8006836175918579, "rewards/margins": 1.0005614757537842, "rewards/rejected": -0.20032349228858948, "step": 1510 }, { "epoch": 0.40063257775434896, "grad_norm": 92.53329621803883, "learning_rate": 8.999077490774908e-07, "logits/chosen": 0.916210949420929, "logits/rejected": 0.842968761920929, "logps/chosen": -340.8500061035156, "logps/rejected": -307.1499938964844, "loss": 0.599, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.8841308355331421, "rewards/margins": 0.98388671875, "rewards/rejected": -0.0982666015625, "step": 1520 }, { "epoch": 0.4032683183974697, "grad_norm": 142.25727252821756, "learning_rate": 8.992488139167106e-07, "logits/chosen": 0.9462890625, "logits/rejected": 1.0251953601837158, "logps/chosen": -338.79998779296875, "logps/rejected": -319.20001220703125, "loss": 0.6307, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2380859851837158, "rewards/margins": 1.095312476158142, "rewards/rejected": 0.14508667588233948, "step": 1530 }, { "epoch": 0.4059040590405904, "grad_norm": 68.5673750452239, "learning_rate": 8.985898787559304e-07, "logits/chosen": 1.0648925304412842, "logits/rejected": 1.028906226158142, "logps/chosen": -337.8999938964844, "logps/rejected": -298.6499938964844, "loss": 0.938, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7266601324081421, "rewards/margins": 0.4948486387729645, "rewards/rejected": 0.23270264267921448, "step": 1540 }, { "epoch": 0.4085397996837111, "grad_norm": 123.30616906142332, "learning_rate": 8.979309435951502e-07, "logits/chosen": 0.9593750238418579, "logits/rejected": 0.9354492425918579, "logps/chosen": -354.20001220703125, "logps/rejected": -346.1000061035156, "loss": 0.5078, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.109375, "rewards/margins": 1.0419921875, "rewards/rejected": 0.0675048828125, "step": 1550 }, { "epoch": 0.41117554032683185, "grad_norm": 110.91530594437498, "learning_rate": 8.9727200843437e-07, "logits/chosen": 0.7389281988143921, "logits/rejected": 0.6570800542831421, "logps/chosen": -375.20001220703125, "logps/rejected": -373.92498779296875, "loss": 0.54, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6449218988418579, "rewards/margins": 1.049218773841858, "rewards/rejected": -0.403533935546875, "step": 1560 }, { "epoch": 0.41381128096995257, "grad_norm": 163.51372341800158, "learning_rate": 8.966130732735899e-07, "logits/chosen": 0.748583972454071, "logits/rejected": 0.568743884563446, "logps/chosen": -391.8500061035156, "logps/rejected": -352.70001220703125, "loss": 0.6282, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.28104859590530396, "rewards/margins": 0.8446289300918579, "rewards/rejected": -0.562939465045929, "step": 1570 }, { "epoch": 0.4164470216130733, "grad_norm": 112.30237484621573, "learning_rate": 8.959541381128096e-07, "logits/chosen": 0.884082019329071, "logits/rejected": 0.787890613079071, "logps/chosen": -412.5, "logps/rejected": -366.6000061035156, "loss": 0.5528, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6893554925918579, "rewards/margins": 1.081701636314392, "rewards/rejected": -0.3919433653354645, "step": 1580 }, { "epoch": 0.419082762256194, "grad_norm": 97.95393691257765, "learning_rate": 8.952952029520294e-07, "logits/chosen": 0.924121081829071, "logits/rejected": 0.8671875, "logps/chosen": -417.25, "logps/rejected": -366.45001220703125, "loss": 0.5179, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.0999023914337158, "rewards/margins": 1.1774413585662842, "rewards/rejected": -0.07860717922449112, "step": 1590 }, { "epoch": 0.42171850289931473, "grad_norm": 92.56737458057482, "learning_rate": 8.946362677912493e-07, "logits/chosen": 1.121484398841858, "logits/rejected": 0.9996093511581421, "logps/chosen": -361.29998779296875, "logps/rejected": -319.6499938964844, "loss": 0.6219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.292382836341858, "rewards/margins": 1.09765625, "rewards/rejected": 0.19541625678539276, "step": 1600 }, { "epoch": 0.42435424354243545, "grad_norm": 129.41639681692743, "learning_rate": 8.939773326304691e-07, "logits/chosen": 0.9892578125, "logits/rejected": 1.116601586341858, "logps/chosen": -391.8500061035156, "logps/rejected": -342.8500061035156, "loss": 0.5335, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.6238281726837158, "rewards/margins": 1.148168921470642, "rewards/rejected": 0.47663575410842896, "step": 1610 }, { "epoch": 0.4269899841855561, "grad_norm": 110.08179791066216, "learning_rate": 8.93318397469689e-07, "logits/chosen": 1.0275390148162842, "logits/rejected": 0.9957031011581421, "logps/chosen": -358.6000061035156, "logps/rejected": -349.95001220703125, "loss": 0.6112, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.556054711341858, "rewards/margins": 0.947460949420929, "rewards/rejected": 0.6097167730331421, "step": 1620 }, { "epoch": 0.42962572482867684, "grad_norm": 154.01834428428666, "learning_rate": 8.926594623089087e-07, "logits/chosen": 1.1242187023162842, "logits/rejected": 0.9706054925918579, "logps/chosen": -387.1499938964844, "logps/rejected": -340.20001220703125, "loss": 0.6445, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.1309325695037842, "rewards/margins": 0.7833251953125, "rewards/rejected": 0.34788817167282104, "step": 1630 }, { "epoch": 0.43226146547179756, "grad_norm": 154.44687133261357, "learning_rate": 8.920005271481286e-07, "logits/chosen": 0.8194335699081421, "logits/rejected": 0.7684570550918579, "logps/chosen": -379.2749938964844, "logps/rejected": -338.375, "loss": 0.6195, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1452147960662842, "rewards/margins": 1.13671875, "rewards/rejected": 0.007214355282485485, "step": 1640 }, { "epoch": 0.4348972061149183, "grad_norm": 96.69262004252926, "learning_rate": 8.913415919873483e-07, "logits/chosen": 0.82623291015625, "logits/rejected": 0.685107409954071, "logps/chosen": -343.20001220703125, "logps/rejected": -332.95001220703125, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": 1.0631835460662842, "rewards/margins": 1.1551513671875, "rewards/rejected": -0.0911865234375, "step": 1650 }, { "epoch": 0.437532946758039, "grad_norm": 132.10739435193378, "learning_rate": 8.906826568265682e-07, "logits/chosen": 0.760693371295929, "logits/rejected": 0.694091796875, "logps/chosen": -353.8500061035156, "logps/rejected": -322.75, "loss": 0.7035, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.933398425579071, "rewards/margins": 0.6878417730331421, "rewards/rejected": 0.2454833984375, "step": 1660 }, { "epoch": 0.4401686874011597, "grad_norm": 150.60290460511956, "learning_rate": 8.900237216657881e-07, "logits/chosen": 0.874804675579071, "logits/rejected": 0.8836914300918579, "logps/chosen": -344.79998779296875, "logps/rejected": -321.79998779296875, "loss": 0.5862, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0520508289337158, "rewards/margins": 0.863818347454071, "rewards/rejected": 0.189208984375, "step": 1670 }, { "epoch": 0.44280442804428044, "grad_norm": 155.45056513298545, "learning_rate": 8.893647865050079e-07, "logits/chosen": 0.775585949420929, "logits/rejected": 0.804492175579071, "logps/chosen": -348.45001220703125, "logps/rejected": -342.20001220703125, "loss": 0.6879, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.826464831829071, "rewards/margins": 0.696972668170929, "rewards/rejected": 0.1298828125, "step": 1680 }, { "epoch": 0.44544016868740116, "grad_norm": 141.0372992345537, "learning_rate": 8.887058513442277e-07, "logits/chosen": 0.948535144329071, "logits/rejected": 0.953076183795929, "logps/chosen": -379.67498779296875, "logps/rejected": -352.8500061035156, "loss": 0.6138, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.02069091796875, "rewards/margins": 0.908215343952179, "rewards/rejected": 0.11327514797449112, "step": 1690 }, { "epoch": 0.4480759093305219, "grad_norm": 156.43145407724361, "learning_rate": 8.880469161834475e-07, "logits/chosen": 0.973095715045929, "logits/rejected": 0.825634777545929, "logps/chosen": -348.1000061035156, "logps/rejected": -321.375, "loss": 0.679, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.6850951910018921, "rewards/margins": 0.697949230670929, "rewards/rejected": -0.012332153506577015, "step": 1700 }, { "epoch": 0.4507116499736426, "grad_norm": 87.5983076873199, "learning_rate": 8.873879810226673e-07, "logits/chosen": 0.927685558795929, "logits/rejected": 0.93115234375, "logps/chosen": -368.04998779296875, "logps/rejected": -335.3999938964844, "loss": 0.5881, "rewards/accuracies": 0.6875, "rewards/chosen": 1.035253882408142, "rewards/margins": 0.823486328125, "rewards/rejected": 0.21247558295726776, "step": 1710 }, { "epoch": 0.4533473906167633, "grad_norm": 112.179514058547, "learning_rate": 8.867290458618871e-07, "logits/chosen": 0.874218761920929, "logits/rejected": 0.8067382574081421, "logps/chosen": -368.375, "logps/rejected": -335.75, "loss": 0.6121, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.1123535633087158, "rewards/margins": 0.8773193359375, "rewards/rejected": 0.23378296196460724, "step": 1720 }, { "epoch": 0.45598313125988404, "grad_norm": 121.84653906520444, "learning_rate": 8.860701107011069e-07, "logits/chosen": 0.9111328125, "logits/rejected": 0.8140624761581421, "logps/chosen": -460.75, "logps/rejected": -381.5, "loss": 0.5591, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.89703369140625, "rewards/margins": 1.210107445716858, "rewards/rejected": -0.312744140625, "step": 1730 }, { "epoch": 0.45861887190300477, "grad_norm": 114.80305041027107, "learning_rate": 8.854111755403268e-07, "logits/chosen": 0.7767578363418579, "logits/rejected": 0.7957519292831421, "logps/chosen": -336.8500061035156, "logps/rejected": -321.20001220703125, "loss": 0.605, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.0812256336212158, "rewards/margins": 0.939160168170929, "rewards/rejected": 0.141387939453125, "step": 1740 }, { "epoch": 0.4612546125461255, "grad_norm": 118.57395126353, "learning_rate": 8.847522403795466e-07, "logits/chosen": 0.973828136920929, "logits/rejected": 0.881542980670929, "logps/chosen": -378.75, "logps/rejected": -413.54998779296875, "loss": 0.5817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1736328601837158, "rewards/margins": 1.4208984375, "rewards/rejected": -0.2469482421875, "step": 1750 }, { "epoch": 0.46389035318924615, "grad_norm": 102.79459315239511, "learning_rate": 8.840933052187665e-07, "logits/chosen": 0.954296886920929, "logits/rejected": 0.8027099370956421, "logps/chosen": -333.6000061035156, "logps/rejected": -311.25, "loss": 0.7111, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.621508777141571, "rewards/margins": 0.606616199016571, "rewards/rejected": 0.013653564266860485, "step": 1760 }, { "epoch": 0.4665260938323669, "grad_norm": 131.9570146851555, "learning_rate": 8.834343700579862e-07, "logits/chosen": 0.7140868902206421, "logits/rejected": 0.623242199420929, "logps/chosen": -371.29998779296875, "logps/rejected": -370.75, "loss": 0.6153, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.926440417766571, "rewards/margins": 0.8843749761581421, "rewards/rejected": 0.04213867336511612, "step": 1770 }, { "epoch": 0.4691618344754876, "grad_norm": 179.2886277501807, "learning_rate": 8.827754348972061e-07, "logits/chosen": 0.6734374761581421, "logits/rejected": 0.5476318597793579, "logps/chosen": -392.29998779296875, "logps/rejected": -401.0, "loss": 0.6021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.697338879108429, "rewards/margins": 1.0985596179962158, "rewards/rejected": -0.40058594942092896, "step": 1780 }, { "epoch": 0.4717975751186083, "grad_norm": 75.29945681477304, "learning_rate": 8.821164997364259e-07, "logits/chosen": 0.8192383050918579, "logits/rejected": 0.759326159954071, "logps/chosen": -380.125, "logps/rejected": -346.3500061035156, "loss": 0.5529, "rewards/accuracies": 0.75, "rewards/chosen": 0.6876891851425171, "rewards/margins": 1.1864745616912842, "rewards/rejected": -0.49846190214157104, "step": 1790 }, { "epoch": 0.47443331576172904, "grad_norm": 122.33576597830447, "learning_rate": 8.814575645756457e-07, "logits/chosen": 0.812207043170929, "logits/rejected": 0.73193359375, "logps/chosen": -328.6000061035156, "logps/rejected": -345.25, "loss": 0.6118, "rewards/accuracies": 0.6875, "rewards/chosen": 0.536303699016571, "rewards/margins": 0.978222668170929, "rewards/rejected": -0.4423980712890625, "step": 1800 }, { "epoch": 0.47706905640484976, "grad_norm": 98.10184661547818, "learning_rate": 8.807986294148655e-07, "logits/chosen": 0.9385741949081421, "logits/rejected": 0.707324206829071, "logps/chosen": -347.8999938964844, "logps/rejected": -369.1000061035156, "loss": 0.5895, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6557372808456421, "rewards/margins": 0.9104369878768921, "rewards/rejected": -0.2542968690395355, "step": 1810 }, { "epoch": 0.4797047970479705, "grad_norm": 127.0246063580091, "learning_rate": 8.801396942540854e-07, "logits/chosen": 0.931835949420929, "logits/rejected": 0.821972668170929, "logps/chosen": -393.3500061035156, "logps/rejected": -362.04998779296875, "loss": 0.5778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4791503846645355, "rewards/margins": 1.090087890625, "rewards/rejected": -0.6118835210800171, "step": 1820 }, { "epoch": 0.4823405376910912, "grad_norm": 102.64294143226114, "learning_rate": 8.794807590933052e-07, "logits/chosen": 0.741406261920929, "logits/rejected": 0.6942383050918579, "logps/chosen": -331.04998779296875, "logps/rejected": -328.5, "loss": 0.5869, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4540161192417145, "rewards/margins": 0.929248034954071, "rewards/rejected": -0.47630614042282104, "step": 1830 }, { "epoch": 0.4849762783342119, "grad_norm": 104.36024624589888, "learning_rate": 8.788218239325251e-07, "logits/chosen": 0.9540039300918579, "logits/rejected": 0.870312511920929, "logps/chosen": -368.6499938964844, "logps/rejected": -365.0, "loss": 0.5912, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4742675721645355, "rewards/margins": 0.8700927495956421, "rewards/rejected": -0.39625245332717896, "step": 1840 }, { "epoch": 0.48761201897733264, "grad_norm": 104.56219397457592, "learning_rate": 8.781628887717448e-07, "logits/chosen": 0.794628918170929, "logits/rejected": 0.6727050542831421, "logps/chosen": -404.54998779296875, "logps/rejected": -363.3500061035156, "loss": 0.5398, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.39483642578125, "rewards/margins": 1.0853760242462158, "rewards/rejected": -0.691699206829071, "step": 1850 }, { "epoch": 0.49024775962045336, "grad_norm": 174.4242705419399, "learning_rate": 8.775039536109647e-07, "logits/chosen": 0.6243652105331421, "logits/rejected": 0.6126464605331421, "logps/chosen": -407.8999938964844, "logps/rejected": -373.20001220703125, "loss": 0.6219, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16447143256664276, "rewards/margins": 1.091162085533142, "rewards/rejected": -0.926849365234375, "step": 1860 }, { "epoch": 0.4928835002635741, "grad_norm": 149.58508179307944, "learning_rate": 8.768450184501844e-07, "logits/chosen": 0.697705090045929, "logits/rejected": 0.6221679449081421, "logps/chosen": -328.6000061035156, "logps/rejected": -320.25, "loss": 0.685, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08653564751148224, "rewards/margins": 0.9156860113143921, "rewards/rejected": -1.001440405845642, "step": 1870 }, { "epoch": 0.4955192409066948, "grad_norm": 96.40909914432861, "learning_rate": 8.761860832894043e-07, "logits/chosen": 0.727099597454071, "logits/rejected": 0.7153075933456421, "logps/chosen": -358.8999938964844, "logps/rejected": -326.7250061035156, "loss": 0.6447, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.361480712890625, "rewards/margins": 0.983325183391571, "rewards/rejected": -0.6215850710868835, "step": 1880 }, { "epoch": 0.4981549815498155, "grad_norm": 174.23711113249243, "learning_rate": 8.75527148128624e-07, "logits/chosen": 0.937695324420929, "logits/rejected": 0.8568359613418579, "logps/chosen": -399.6499938964844, "logps/rejected": -352.2250061035156, "loss": 0.5558, "rewards/accuracies": 0.6875, "rewards/chosen": 0.664294421672821, "rewards/margins": 1.072509765625, "rewards/rejected": -0.4076080322265625, "step": 1890 }, { "epoch": 0.5007907221929362, "grad_norm": 104.17485066765175, "learning_rate": 8.74868212967844e-07, "logits/chosen": 0.9443359375, "logits/rejected": 0.9779297113418579, "logps/chosen": -383.5249938964844, "logps/rejected": -341.8999938964844, "loss": 0.5151, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5769287347793579, "rewards/margins": 1.1505858898162842, "rewards/rejected": -0.5737549066543579, "step": 1900 }, { "epoch": 0.503426462836057, "grad_norm": 79.53325629328891, "learning_rate": 8.742092778070638e-07, "logits/chosen": 0.855273425579071, "logits/rejected": 0.723864734172821, "logps/chosen": -353.79998779296875, "logps/rejected": -323.29998779296875, "loss": 0.5286, "rewards/accuracies": 0.6875, "rewards/chosen": 0.519580066204071, "rewards/margins": 1.0422852039337158, "rewards/rejected": -0.5225464105606079, "step": 1910 }, { "epoch": 0.5060622034791776, "grad_norm": 99.62990413275413, "learning_rate": 8.735503426462836e-07, "logits/chosen": 0.957226574420929, "logits/rejected": 0.8756347894668579, "logps/chosen": -359.5249938964844, "logps/rejected": -368.79998779296875, "loss": 0.6175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5185302495956421, "rewards/margins": 0.9916747808456421, "rewards/rejected": -0.4727844297885895, "step": 1920 }, { "epoch": 0.5086979441222984, "grad_norm": 84.97672727475553, "learning_rate": 8.728914074855034e-07, "logits/chosen": 0.969042956829071, "logits/rejected": 1.0148437023162842, "logps/chosen": -371.75, "logps/rejected": -354.54998779296875, "loss": 0.4999, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.789379894733429, "rewards/margins": 1.2113769054412842, "rewards/rejected": -0.4222168028354645, "step": 1930 }, { "epoch": 0.5113336847654191, "grad_norm": 100.8406309167175, "learning_rate": 8.722324723247231e-07, "logits/chosen": 0.7600616216659546, "logits/rejected": 0.795703113079071, "logps/chosen": -339.1499938964844, "logps/rejected": -332.45001220703125, "loss": 0.5753, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3880371153354645, "rewards/margins": 1.0784180164337158, "rewards/rejected": -0.6902831792831421, "step": 1940 }, { "epoch": 0.5139694254085398, "grad_norm": 135.6355847317415, "learning_rate": 8.71573537163943e-07, "logits/chosen": 0.770068347454071, "logits/rejected": 0.763134777545929, "logps/chosen": -366.3500061035156, "logps/rejected": -340.8500061035156, "loss": 0.7228, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06081543117761612, "rewards/margins": 0.7760009765625, "rewards/rejected": -0.836669921875, "step": 1950 }, { "epoch": 0.5166051660516605, "grad_norm": 151.59815684865595, "learning_rate": 8.709146020031628e-07, "logits/chosen": 0.675341784954071, "logits/rejected": 0.7724609375, "logps/chosen": -348.6000061035156, "logps/rejected": -320.75, "loss": 0.5736, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.2658935487270355, "rewards/margins": 0.983154296875, "rewards/rejected": -0.7170165777206421, "step": 1960 }, { "epoch": 0.5192409066947812, "grad_norm": 89.62422985123334, "learning_rate": 8.702556668423827e-07, "logits/chosen": 0.6455078125, "logits/rejected": 0.5783447027206421, "logps/chosen": -370.8999938964844, "logps/rejected": -353.70001220703125, "loss": 0.5574, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23833008110523224, "rewards/margins": 1.122949242591858, "rewards/rejected": -0.8848327398300171, "step": 1970 }, { "epoch": 0.521876647337902, "grad_norm": 98.12831813463168, "learning_rate": 8.695967316816025e-07, "logits/chosen": 0.700390636920929, "logits/rejected": 0.7972656488418579, "logps/chosen": -325.6000061035156, "logps/rejected": -320.70001220703125, "loss": 0.5605, "rewards/accuracies": 0.71875, "rewards/chosen": 0.601391613483429, "rewards/margins": 0.9454101324081421, "rewards/rejected": -0.34404295682907104, "step": 1980 }, { "epoch": 0.5245123879810226, "grad_norm": 162.67437538094288, "learning_rate": 8.689377965208223e-07, "logits/chosen": 0.6695801019668579, "logits/rejected": 0.689160168170929, "logps/chosen": -347.0, "logps/rejected": -373.1000061035156, "loss": 0.6075, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2801757752895355, "rewards/margins": 0.8503662347793579, "rewards/rejected": -0.5699828863143921, "step": 1990 }, { "epoch": 0.5271481286241434, "grad_norm": 102.41840890097387, "learning_rate": 8.682788613600421e-07, "logits/chosen": 0.9305664300918579, "logits/rejected": 0.85791015625, "logps/chosen": -346.95001220703125, "logps/rejected": -344.6000061035156, "loss": 0.6272, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.730761706829071, "rewards/margins": 0.993725597858429, "rewards/rejected": -0.2628540098667145, "step": 2000 }, { "epoch": 0.5297838692672641, "grad_norm": 127.09690065586555, "learning_rate": 8.67619926199262e-07, "logits/chosen": 0.8280273675918579, "logits/rejected": 0.857617199420929, "logps/chosen": -356.29998779296875, "logps/rejected": -339.54998779296875, "loss": 0.6611, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7518554925918579, "rewards/margins": 0.8392333984375, "rewards/rejected": -0.08702392876148224, "step": 2010 }, { "epoch": 0.5324196099103848, "grad_norm": 124.46723447990148, "learning_rate": 8.669609910384817e-07, "logits/chosen": 0.835278332233429, "logits/rejected": 0.75341796875, "logps/chosen": -399.3999938964844, "logps/rejected": -355.0, "loss": 0.5978, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.979644775390625, "rewards/margins": 0.9771728515625, "rewards/rejected": 0.00201416015625, "step": 2020 }, { "epoch": 0.5350553505535055, "grad_norm": 135.16678257433523, "learning_rate": 8.663020558777016e-07, "logits/chosen": 0.9014648199081421, "logits/rejected": 0.838330090045929, "logps/chosen": -397.54998779296875, "logps/rejected": -368.29998779296875, "loss": 0.5721, "rewards/accuracies": 0.6875, "rewards/chosen": 0.980731189250946, "rewards/margins": 0.974438488483429, "rewards/rejected": 0.005908203311264515, "step": 2030 }, { "epoch": 0.5376910911966263, "grad_norm": 113.63135888762437, "learning_rate": 8.656431207169214e-07, "logits/chosen": 0.8919922113418579, "logits/rejected": 0.7607421875, "logps/chosen": -349.0, "logps/rejected": -314.45001220703125, "loss": 0.584, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.8119140863418579, "rewards/margins": 1.038305640220642, "rewards/rejected": -0.22667236626148224, "step": 2040 }, { "epoch": 0.540326831839747, "grad_norm": 133.8540505479661, "learning_rate": 8.649841855561413e-07, "logits/chosen": 0.9002929925918579, "logits/rejected": 0.8140624761581421, "logps/chosen": -353.3999938964844, "logps/rejected": -370.1000061035156, "loss": 0.7079, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.548815906047821, "rewards/margins": 0.58740234375, "rewards/rejected": -0.03879394382238388, "step": 2050 }, { "epoch": 0.5429625724828677, "grad_norm": 173.57960271554597, "learning_rate": 8.64325250395361e-07, "logits/chosen": 0.9869140386581421, "logits/rejected": 0.788037121295929, "logps/chosen": -364.8500061035156, "logps/rejected": -316.8500061035156, "loss": 0.5413, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.0488770008087158, "rewards/margins": 1.162329077720642, "rewards/rejected": -0.11330566555261612, "step": 2060 }, { "epoch": 0.5455983131259884, "grad_norm": 97.23010016141616, "learning_rate": 8.636663152345809e-07, "logits/chosen": 0.9867187738418579, "logits/rejected": 0.9556640386581421, "logps/chosen": -329.29998779296875, "logps/rejected": -344.45001220703125, "loss": 0.5572, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8464599847793579, "rewards/margins": 0.9805663824081421, "rewards/rejected": -0.13408812880516052, "step": 2070 }, { "epoch": 0.5482340537691092, "grad_norm": 85.25467971733836, "learning_rate": 8.630073800738007e-07, "logits/chosen": 1.101171851158142, "logits/rejected": 0.944042980670929, "logps/chosen": -390.0, "logps/rejected": -369.8999938964844, "loss": 0.5492, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.987109363079071, "rewards/margins": 1.1184570789337158, "rewards/rejected": -0.13010254502296448, "step": 2080 }, { "epoch": 0.5508697944122298, "grad_norm": 162.11988091982116, "learning_rate": 8.623484449130205e-07, "logits/chosen": 0.8863281011581421, "logits/rejected": 0.921142578125, "logps/chosen": -347.6000061035156, "logps/rejected": -367.8999938964844, "loss": 0.7279, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9363769292831421, "rewards/margins": 0.686755359172821, "rewards/rejected": 0.25114744901657104, "step": 2090 }, { "epoch": 0.5535055350553506, "grad_norm": 94.15178891880907, "learning_rate": 8.616895097522403e-07, "logits/chosen": 0.9364013671875, "logits/rejected": 0.828320324420929, "logps/chosen": -374.1499938964844, "logps/rejected": -362.70001220703125, "loss": 0.5882, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9170898199081421, "rewards/margins": 0.9754394292831421, "rewards/rejected": -0.05858306959271431, "step": 2100 }, { "epoch": 0.5561412756984713, "grad_norm": 91.30331191415871, "learning_rate": 8.610305745914601e-07, "logits/chosen": 0.954907238483429, "logits/rejected": 1.00537109375, "logps/chosen": -402.25, "logps/rejected": -392.1000061035156, "loss": 0.5263, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.007959008216858, "rewards/margins": 1.246435523033142, "rewards/rejected": -0.23804931342601776, "step": 2110 }, { "epoch": 0.5587770163415919, "grad_norm": 119.53130305601019, "learning_rate": 8.6037163943068e-07, "logits/chosen": 0.981640636920929, "logits/rejected": 0.9312499761581421, "logps/chosen": -324.1499938964844, "logps/rejected": -299.1000061035156, "loss": 0.5885, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.5989013910293579, "rewards/margins": 0.8619384765625, "rewards/rejected": -0.26325684785842896, "step": 2120 }, { "epoch": 0.5614127569847127, "grad_norm": 92.5134004616522, "learning_rate": 8.597127042698999e-07, "logits/chosen": 0.8399413824081421, "logits/rejected": 0.85009765625, "logps/chosen": -375.70001220703125, "logps/rejected": -312.95001220703125, "loss": 0.5898, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.673828125, "rewards/margins": 0.9403076171875, "rewards/rejected": -0.2657714784145355, "step": 2130 }, { "epoch": 0.5640484976278334, "grad_norm": 108.15026918916944, "learning_rate": 8.590537691091196e-07, "logits/chosen": 0.925000011920929, "logits/rejected": 0.87255859375, "logps/chosen": -303.70001220703125, "logps/rejected": -338.8500061035156, "loss": 0.5046, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.557373046875, "rewards/margins": 1.3761718273162842, "rewards/rejected": -0.8194214105606079, "step": 2140 }, { "epoch": 0.5666842382709542, "grad_norm": 126.37478978456677, "learning_rate": 8.583948339483395e-07, "logits/chosen": 0.7840820550918579, "logits/rejected": 0.6627441644668579, "logps/chosen": -358.25, "logps/rejected": -310.70001220703125, "loss": 0.5773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12469482421875, "rewards/margins": 0.985595703125, "rewards/rejected": -0.860546886920929, "step": 2150 }, { "epoch": 0.5693199789140748, "grad_norm": 120.20627883367696, "learning_rate": 8.577358987875592e-07, "logits/chosen": 0.8458496332168579, "logits/rejected": 0.802929699420929, "logps/chosen": -366.95001220703125, "logps/rejected": -355.3500061035156, "loss": 0.5595, "rewards/accuracies": 0.71875, "rewards/chosen": 0.209503173828125, "rewards/margins": 1.0751953125, "rewards/rejected": -0.864697277545929, "step": 2160 }, { "epoch": 0.5719557195571956, "grad_norm": 100.84195224424599, "learning_rate": 8.570769636267791e-07, "logits/chosen": 0.807812511920929, "logits/rejected": 0.878222644329071, "logps/chosen": -347.1499938964844, "logps/rejected": -342.0, "loss": 0.6167, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.15473632514476776, "rewards/margins": 0.9566284418106079, "rewards/rejected": -0.801953136920929, "step": 2170 }, { "epoch": 0.5745914602003163, "grad_norm": 111.37409123614518, "learning_rate": 8.564180284659989e-07, "logits/chosen": 0.8306640386581421, "logits/rejected": 0.796093761920929, "logps/chosen": -358.54998779296875, "logps/rejected": -324.5, "loss": 0.5678, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.40043944120407104, "rewards/margins": 1.0399658679962158, "rewards/rejected": -0.6394622921943665, "step": 2180 }, { "epoch": 0.577227200843437, "grad_norm": 151.5289779616424, "learning_rate": 8.557590933052188e-07, "logits/chosen": 0.86865234375, "logits/rejected": 0.815185546875, "logps/chosen": -334.95001220703125, "logps/rejected": -314.70001220703125, "loss": 0.7059, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.374969482421875, "rewards/margins": 0.7479003667831421, "rewards/rejected": -0.37236326932907104, "step": 2190 }, { "epoch": 0.5798629414865577, "grad_norm": 132.22836293776808, "learning_rate": 8.551001581444386e-07, "logits/chosen": 0.7979491949081421, "logits/rejected": 0.8109375238418579, "logps/chosen": -361.20001220703125, "logps/rejected": -359.29998779296875, "loss": 0.5324, "rewards/accuracies": 0.71875, "rewards/chosen": 0.598388671875, "rewards/margins": 1.140527367591858, "rewards/rejected": -0.5426269769668579, "step": 2200 }, { "epoch": 0.5824986821296785, "grad_norm": 66.62520161164807, "learning_rate": 8.544412229836584e-07, "logits/chosen": 0.879589855670929, "logits/rejected": 0.723925769329071, "logps/chosen": -358.1499938964844, "logps/rejected": -340.25, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": 0.5972900390625, "rewards/margins": 1.350488305091858, "rewards/rejected": -0.754833996295929, "step": 2210 }, { "epoch": 0.5851344227727991, "grad_norm": 116.32064325783625, "learning_rate": 8.537822878228782e-07, "logits/chosen": 0.8563476800918579, "logits/rejected": 0.7557617425918579, "logps/chosen": -385.79998779296875, "logps/rejected": -352.6499938964844, "loss": 0.5331, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.17641600966453552, "rewards/margins": 1.1685059070587158, "rewards/rejected": -0.991961658000946, "step": 2220 }, { "epoch": 0.5877701634159199, "grad_norm": 92.69014287554712, "learning_rate": 8.53123352662098e-07, "logits/chosen": 0.7899414300918579, "logits/rejected": 0.814648449420929, "logps/chosen": -364.1499938964844, "logps/rejected": -329.1499938964844, "loss": 0.6144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1484375, "rewards/margins": 1.0140869617462158, "rewards/rejected": -1.162988305091858, "step": 2230 }, { "epoch": 0.5904059040590406, "grad_norm": 107.86896343366291, "learning_rate": 8.524644175013178e-07, "logits/chosen": 0.748046875, "logits/rejected": 0.8519531488418579, "logps/chosen": -412.70001220703125, "logps/rejected": -364.8500061035156, "loss": 0.5611, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.26556396484375, "rewards/margins": 1.188928246498108, "rewards/rejected": -0.9232422113418579, "step": 2240 }, { "epoch": 0.5930416447021613, "grad_norm": 120.19477804823983, "learning_rate": 8.518054823405377e-07, "logits/chosen": 1.0392577648162842, "logits/rejected": 1.052734375, "logps/chosen": -351.8500061035156, "logps/rejected": -328.6000061035156, "loss": 0.5346, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.605761706829071, "rewards/margins": 0.997851550579071, "rewards/rejected": -0.391448974609375, "step": 2250 }, { "epoch": 0.595677385345282, "grad_norm": 127.05769820797376, "learning_rate": 8.511465471797574e-07, "logits/chosen": 0.7806152105331421, "logits/rejected": 0.7120727300643921, "logps/chosen": -394.6000061035156, "logps/rejected": -364.29998779296875, "loss": 0.5915, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.771191418170929, "rewards/margins": 1.020532250404358, "rewards/rejected": -0.24806518852710724, "step": 2260 }, { "epoch": 0.5983131259884027, "grad_norm": 71.95295700167496, "learning_rate": 8.504876120189774e-07, "logits/chosen": 0.822802722454071, "logits/rejected": 0.9352051019668579, "logps/chosen": -382.6499938964844, "logps/rejected": -350.1499938964844, "loss": 0.5259, "rewards/accuracies": 0.75, "rewards/chosen": 0.5700439214706421, "rewards/margins": 1.190820336341858, "rewards/rejected": -0.6204833984375, "step": 2270 }, { "epoch": 0.6009488666315235, "grad_norm": 88.32750204744693, "learning_rate": 8.498286768581971e-07, "logits/chosen": 0.7880859375, "logits/rejected": 0.6934875249862671, "logps/chosen": -365.6499938964844, "logps/rejected": -340.6499938964844, "loss": 0.621, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.43889158964157104, "rewards/margins": 1.039086937904358, "rewards/rejected": -0.6003662347793579, "step": 2280 }, { "epoch": 0.6035846072746441, "grad_norm": 110.92733002080388, "learning_rate": 8.49169741697417e-07, "logits/chosen": 0.874218761920929, "logits/rejected": 0.819531261920929, "logps/chosen": -399.70001220703125, "logps/rejected": -343.04998779296875, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": 0.5514892339706421, "rewards/margins": 1.44677734375, "rewards/rejected": -0.895050048828125, "step": 2290 }, { "epoch": 0.6062203479177649, "grad_norm": 99.90440260416403, "learning_rate": 8.485108065366368e-07, "logits/chosen": 0.852343738079071, "logits/rejected": 0.8634277582168579, "logps/chosen": -414.20001220703125, "logps/rejected": -351.3500061035156, "loss": 0.5695, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7327880859375, "rewards/margins": 1.075292944908142, "rewards/rejected": -0.3416503965854645, "step": 2300 }, { "epoch": 0.6088560885608856, "grad_norm": 100.55989560489931, "learning_rate": 8.478518713758565e-07, "logits/chosen": 0.840624988079071, "logits/rejected": 0.8241211175918579, "logps/chosen": -353.75, "logps/rejected": -336.79998779296875, "loss": 0.6117, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6196533441543579, "rewards/margins": 0.8164306879043579, "rewards/rejected": -0.19636841118335724, "step": 2310 }, { "epoch": 0.6114918292040064, "grad_norm": 87.73748502401234, "learning_rate": 8.471929362150764e-07, "logits/chosen": 1.0133788585662842, "logits/rejected": 1.0402343273162842, "logps/chosen": -374.79998779296875, "logps/rejected": -371.6499938964844, "loss": 0.5882, "rewards/accuracies": 0.65625, "rewards/chosen": 0.39497071504592896, "rewards/margins": 1.1228516101837158, "rewards/rejected": -0.727246105670929, "step": 2320 }, { "epoch": 0.614127569847127, "grad_norm": 121.18351496430915, "learning_rate": 8.465340010542961e-07, "logits/chosen": 0.915722668170929, "logits/rejected": 0.709912121295929, "logps/chosen": -383.29998779296875, "logps/rejected": -358.75, "loss": 0.6047, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19476929306983948, "rewards/margins": 1.14178466796875, "rewards/rejected": -0.94671630859375, "step": 2330 }, { "epoch": 0.6167633104902478, "grad_norm": 66.82525512811492, "learning_rate": 8.458750658935161e-07, "logits/chosen": 0.8724120855331421, "logits/rejected": 0.8249756097793579, "logps/chosen": -366.625, "logps/rejected": -359.3374938964844, "loss": 0.6669, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4858764708042145, "rewards/margins": 1.015753149986267, "rewards/rejected": -0.52972412109375, "step": 2340 }, { "epoch": 0.6193990511333685, "grad_norm": 69.41475529500612, "learning_rate": 8.45216130732736e-07, "logits/chosen": 0.944042980670929, "logits/rejected": 0.817138671875, "logps/chosen": -384.6499938964844, "logps/rejected": -370.42498779296875, "loss": 0.5396, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.6686767339706421, "rewards/margins": 1.256201148033142, "rewards/rejected": -0.5883544683456421, "step": 2350 }, { "epoch": 0.6220347917764892, "grad_norm": 100.32436050275324, "learning_rate": 8.445571955719557e-07, "logits/chosen": 0.836621105670929, "logits/rejected": 0.7987304925918579, "logps/chosen": -369.1499938964844, "logps/rejected": -324.25, "loss": 0.5288, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.865917980670929, "rewards/margins": 1.352685570716858, "rewards/rejected": -0.4856323301792145, "step": 2360 }, { "epoch": 0.6246705324196099, "grad_norm": 91.45181814026573, "learning_rate": 8.438982604111755e-07, "logits/chosen": 0.9532470703125, "logits/rejected": 0.9115234613418579, "logps/chosen": -346.20001220703125, "logps/rejected": -322.79998779296875, "loss": 0.4799, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.75360107421875, "rewards/margins": 1.326269507408142, "rewards/rejected": -0.5716797113418579, "step": 2370 }, { "epoch": 0.6273062730627307, "grad_norm": 77.2834949423477, "learning_rate": 8.432393252503953e-07, "logits/chosen": 0.9081054925918579, "logits/rejected": 0.9273437261581421, "logps/chosen": -362.1499938964844, "logps/rejected": -352.0, "loss": 0.5293, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.9027343988418579, "rewards/margins": 1.2497069835662842, "rewards/rejected": -0.3454956114292145, "step": 2380 }, { "epoch": 0.6299420137058513, "grad_norm": 97.78009650625408, "learning_rate": 8.425803900896151e-07, "logits/chosen": 0.755810558795929, "logits/rejected": 0.7772461175918579, "logps/chosen": -347.8500061035156, "logps/rejected": -346.79998779296875, "loss": 0.4887, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8187500238418579, "rewards/margins": 1.269287109375, "rewards/rejected": -0.4504150450229645, "step": 2390 }, { "epoch": 0.632577754348972, "grad_norm": 113.13579060883261, "learning_rate": 8.419214549288349e-07, "logits/chosen": 0.705273449420929, "logits/rejected": 0.686083972454071, "logps/chosen": -428.3500061035156, "logps/rejected": -381.70001220703125, "loss": 0.5421, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6449218988418579, "rewards/margins": 1.27099609375, "rewards/rejected": -0.626844048500061, "step": 2400 }, { "epoch": 0.6352134949920928, "grad_norm": 146.584464979973, "learning_rate": 8.412625197680547e-07, "logits/chosen": 0.5204132199287415, "logits/rejected": 0.49213868379592896, "logps/chosen": -392.32501220703125, "logps/rejected": -364.6499938964844, "loss": 0.6522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.15739746391773224, "rewards/margins": 1.0338623523712158, "rewards/rejected": -0.877062976360321, "step": 2410 }, { "epoch": 0.6378492356352135, "grad_norm": 83.35918576894538, "learning_rate": 8.406035846072747e-07, "logits/chosen": 0.7005859613418579, "logits/rejected": 0.589160144329071, "logps/chosen": -426.1000061035156, "logps/rejected": -406.75, "loss": 0.5292, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.664599597454071, "rewards/margins": 1.299902319908142, "rewards/rejected": -0.635974109172821, "step": 2420 }, { "epoch": 0.6404849762783342, "grad_norm": 95.85480718866994, "learning_rate": 8.399446494464944e-07, "logits/chosen": 0.788623034954071, "logits/rejected": 0.7386718988418579, "logps/chosen": -385.54998779296875, "logps/rejected": -368.20001220703125, "loss": 0.586, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5651763677597046, "rewards/margins": 1.189062476158142, "rewards/rejected": -0.623608410358429, "step": 2430 }, { "epoch": 0.6431207169214549, "grad_norm": 77.40222029498399, "learning_rate": 8.392857142857143e-07, "logits/chosen": 0.6321777105331421, "logits/rejected": 0.6460937261581421, "logps/chosen": -343.8500061035156, "logps/rejected": -359.25, "loss": 0.5726, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7239745855331421, "rewards/margins": 1.0601074695587158, "rewards/rejected": -0.33566588163375854, "step": 2440 }, { "epoch": 0.6457564575645757, "grad_norm": 82.10571623866431, "learning_rate": 8.38626779124934e-07, "logits/chosen": 0.720507800579071, "logits/rejected": 0.604785144329071, "logps/chosen": -412.3500061035156, "logps/rejected": -387.29998779296875, "loss": 0.5503, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.44157105684280396, "rewards/margins": 1.2356445789337158, "rewards/rejected": -0.7944091558456421, "step": 2450 }, { "epoch": 0.6483921982076963, "grad_norm": 101.6076253868431, "learning_rate": 8.379678439641539e-07, "logits/chosen": 0.508984386920929, "logits/rejected": 0.4714721739292145, "logps/chosen": -363.3999938964844, "logps/rejected": -327.95001220703125, "loss": 0.5759, "rewards/accuracies": 0.6875, "rewards/chosen": 0.31303101778030396, "rewards/margins": 1.1054198741912842, "rewards/rejected": -0.7924560308456421, "step": 2460 }, { "epoch": 0.6510279388508171, "grad_norm": 170.4318267062769, "learning_rate": 8.373089088033737e-07, "logits/chosen": 0.7427612543106079, "logits/rejected": 0.770800769329071, "logps/chosen": -369.95001220703125, "logps/rejected": -356.8999938964844, "loss": 0.6169, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.634570300579071, "rewards/margins": 0.976489245891571, "rewards/rejected": -0.3422607481479645, "step": 2470 }, { "epoch": 0.6536636794939378, "grad_norm": 166.68056310061476, "learning_rate": 8.366499736425935e-07, "logits/chosen": 0.7320922613143921, "logits/rejected": 0.698413074016571, "logps/chosen": -369.95001220703125, "logps/rejected": -362.79998779296875, "loss": 0.7175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6111816167831421, "rewards/margins": 0.840222179889679, "rewards/rejected": -0.22830811142921448, "step": 2480 }, { "epoch": 0.6562994201370586, "grad_norm": 125.40069622713574, "learning_rate": 8.359910384818134e-07, "logits/chosen": 0.8773437738418579, "logits/rejected": 0.6858886480331421, "logps/chosen": -398.0, "logps/rejected": -349.8999938964844, "loss": 0.5925, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0015137195587158, "rewards/margins": 1.1121094226837158, "rewards/rejected": -0.11077880859375, "step": 2490 }, { "epoch": 0.6589351607801792, "grad_norm": 80.54389654354934, "learning_rate": 8.353321033210332e-07, "logits/chosen": 0.9619140625, "logits/rejected": 0.877148449420929, "logps/chosen": -357.1499938964844, "logps/rejected": -329.1000061035156, "loss": 0.5046, "rewards/accuracies": 0.71875, "rewards/chosen": 1.218359351158142, "rewards/margins": 1.223657250404358, "rewards/rejected": -0.0054931640625, "step": 2500 }, { "epoch": 0.6615709014233, "grad_norm": 122.10213818113165, "learning_rate": 8.34673168160253e-07, "logits/chosen": 0.929003894329071, "logits/rejected": 0.8316406011581421, "logps/chosen": -348.3999938964844, "logps/rejected": -324.8999938964844, "loss": 0.5671, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.117773413658142, "rewards/margins": 1.173681616783142, "rewards/rejected": -0.05561523512005806, "step": 2510 }, { "epoch": 0.6642066420664207, "grad_norm": 133.75466056446322, "learning_rate": 8.340142329994729e-07, "logits/chosen": 0.884570300579071, "logits/rejected": 0.858935534954071, "logps/chosen": -339.3500061035156, "logps/rejected": -311.20001220703125, "loss": 0.5802, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.0185546875, "rewards/margins": 1.045068383216858, "rewards/rejected": -0.02672424353659153, "step": 2520 }, { "epoch": 0.6668423827095413, "grad_norm": 115.34941496401763, "learning_rate": 8.333552978386926e-07, "logits/chosen": 0.8546386957168579, "logits/rejected": 0.8434082269668579, "logps/chosen": -331.6000061035156, "logps/rejected": -301.6000061035156, "loss": 0.5824, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.116845726966858, "rewards/margins": 1.060546875, "rewards/rejected": 0.05497436597943306, "step": 2530 }, { "epoch": 0.6694781233526621, "grad_norm": 68.10459895587118, "learning_rate": 8.326963626779125e-07, "logits/chosen": 0.9183593988418579, "logits/rejected": 0.9896484613418579, "logps/chosen": -306.8500061035156, "logps/rejected": -300.8999938964844, "loss": 0.5394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.121069312095642, "rewards/margins": 1.130957007408142, "rewards/rejected": -0.011547851376235485, "step": 2540 }, { "epoch": 0.6721138639957828, "grad_norm": 119.79188647480876, "learning_rate": 8.320374275171322e-07, "logits/chosen": 0.899609386920929, "logits/rejected": 0.8109375238418579, "logps/chosen": -373.20001220703125, "logps/rejected": -364.6000061035156, "loss": 0.5644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.3185546398162842, "rewards/margins": 1.1931641101837158, "rewards/rejected": 0.12547607719898224, "step": 2550 }, { "epoch": 0.6747496046389035, "grad_norm": 111.53687443058234, "learning_rate": 8.313784923563521e-07, "logits/chosen": 0.94921875, "logits/rejected": 0.8486328125, "logps/chosen": -353.1499938964844, "logps/rejected": -309.95001220703125, "loss": 0.6354, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.780346691608429, "rewards/margins": 0.973828136920929, "rewards/rejected": -0.19298705458641052, "step": 2560 }, { "epoch": 0.6773853452820242, "grad_norm": 93.93335146045445, "learning_rate": 8.307195571955719e-07, "logits/chosen": 0.803515613079071, "logits/rejected": 0.7037109136581421, "logps/chosen": -363.2250061035156, "logps/rejected": -326.4750061035156, "loss": 0.4937, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.902099609375, "rewards/margins": 1.357275366783142, "rewards/rejected": -0.454345703125, "step": 2570 }, { "epoch": 0.680021085925145, "grad_norm": 108.59383985573622, "learning_rate": 8.300606220347918e-07, "logits/chosen": 0.92034912109375, "logits/rejected": 0.7542968988418579, "logps/chosen": -398.70001220703125, "logps/rejected": -331.8500061035156, "loss": 0.6097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.648364245891571, "rewards/margins": 1.0951659679412842, "rewards/rejected": -0.4476562440395355, "step": 2580 }, { "epoch": 0.6826568265682657, "grad_norm": 111.93426696650113, "learning_rate": 8.294016868740116e-07, "logits/chosen": 0.8951171636581421, "logits/rejected": 0.7027343511581421, "logps/chosen": -363.6499938964844, "logps/rejected": -322.25, "loss": 0.582, "rewards/accuracies": 0.6875, "rewards/chosen": 1.006445288658142, "rewards/margins": 1.072021484375, "rewards/rejected": -0.06547851860523224, "step": 2590 }, { "epoch": 0.6852925672113864, "grad_norm": 113.07357683128896, "learning_rate": 8.287427517132314e-07, "logits/chosen": 0.705615222454071, "logits/rejected": 0.6603027582168579, "logps/chosen": -409.1000061035156, "logps/rejected": -395.54998779296875, "loss": 0.5635, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.6102050542831421, "rewards/margins": 1.1501953601837158, "rewards/rejected": -0.53955078125, "step": 2600 }, { "epoch": 0.6879283078545071, "grad_norm": 126.3209662931668, "learning_rate": 8.280838165524512e-07, "logits/chosen": 0.7308593988418579, "logits/rejected": 0.6312255859375, "logps/chosen": -406.20001220703125, "logps/rejected": -402.45001220703125, "loss": 0.619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3689941465854645, "rewards/margins": 1.140966773033142, "rewards/rejected": -0.770739734172821, "step": 2610 }, { "epoch": 0.6905640484976279, "grad_norm": 65.01147266771906, "learning_rate": 8.27424881391671e-07, "logits/chosen": 0.7861328125, "logits/rejected": 0.6773926019668579, "logps/chosen": -363.57501220703125, "logps/rejected": -369.1000061035156, "loss": 0.6054, "rewards/accuracies": 0.71875, "rewards/chosen": 0.24039307236671448, "rewards/margins": 1.1116211414337158, "rewards/rejected": -0.8716796636581421, "step": 2620 }, { "epoch": 0.6931997891407485, "grad_norm": 134.08178839807783, "learning_rate": 8.267659462308908e-07, "logits/chosen": 0.6098434329032898, "logits/rejected": 0.655224621295929, "logps/chosen": -360.1000061035156, "logps/rejected": -363.8500061035156, "loss": 0.5874, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.367431640625, "rewards/margins": 1.140893578529358, "rewards/rejected": -0.7735351324081421, "step": 2630 }, { "epoch": 0.6958355297838693, "grad_norm": 50.70910296586905, "learning_rate": 8.261070110701108e-07, "logits/chosen": 0.684619128704071, "logits/rejected": 0.7000061273574829, "logps/chosen": -391.29998779296875, "logps/rejected": -372.79998779296875, "loss": 0.6002, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.4031738340854645, "rewards/margins": 1.144018530845642, "rewards/rejected": -0.7406250238418579, "step": 2640 }, { "epoch": 0.69847127042699, "grad_norm": 122.18051612362638, "learning_rate": 8.254480759093305e-07, "logits/chosen": 0.69427490234375, "logits/rejected": 0.616259753704071, "logps/chosen": -357.6499938964844, "logps/rejected": -348.45001220703125, "loss": 0.4915, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.32095032930374146, "rewards/margins": 1.2526366710662842, "rewards/rejected": -0.931591808795929, "step": 2650 }, { "epoch": 0.7011070110701108, "grad_norm": 141.3382467691499, "learning_rate": 8.247891407485504e-07, "logits/chosen": 0.616162121295929, "logits/rejected": 0.654101550579071, "logps/chosen": -362.8999938964844, "logps/rejected": -340.6499938964844, "loss": 0.6376, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05270995944738388, "rewards/margins": 0.9562011957168579, "rewards/rejected": -0.9030517339706421, "step": 2660 }, { "epoch": 0.7037427517132314, "grad_norm": 102.05707667429517, "learning_rate": 8.241302055877701e-07, "logits/chosen": 0.6595703363418579, "logits/rejected": 0.591552734375, "logps/chosen": -362.8999938964844, "logps/rejected": -342.6499938964844, "loss": 0.5795, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.31989747285842896, "rewards/margins": 1.080078125, "rewards/rejected": -0.7598358392715454, "step": 2670 }, { "epoch": 0.7063784923563521, "grad_norm": 68.71232831662877, "learning_rate": 8.234712704269899e-07, "logits/chosen": 0.85498046875, "logits/rejected": 0.8817383050918579, "logps/chosen": -337.79998779296875, "logps/rejected": -309.95001220703125, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6411987543106079, "rewards/margins": 1.221289038658142, "rewards/rejected": -0.578747570514679, "step": 2680 }, { "epoch": 0.7090142329994729, "grad_norm": 174.86726590722515, "learning_rate": 8.228123352662098e-07, "logits/chosen": 0.797607421875, "logits/rejected": 0.7728515863418579, "logps/chosen": -354.79998779296875, "logps/rejected": -352.75, "loss": 0.6184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.739611804485321, "rewards/margins": 0.990185558795929, "rewards/rejected": -0.2506042420864105, "step": 2690 }, { "epoch": 0.7116499736425935, "grad_norm": 109.53027140876576, "learning_rate": 8.221534001054295e-07, "logits/chosen": 0.8065429925918579, "logits/rejected": 0.744061291217804, "logps/chosen": -376.8999938964844, "logps/rejected": -337.75, "loss": 0.6749, "rewards/accuracies": 0.6875, "rewards/chosen": 0.761474609375, "rewards/margins": 1.05517578125, "rewards/rejected": -0.2929931581020355, "step": 2700 }, { "epoch": 0.7142857142857143, "grad_norm": 93.20307739844311, "learning_rate": 8.214944649446494e-07, "logits/chosen": 0.841015636920929, "logits/rejected": 0.86181640625, "logps/chosen": -393.1499938964844, "logps/rejected": -358.5, "loss": 0.5849, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7152465581893921, "rewards/margins": 1.2301757335662842, "rewards/rejected": -0.5140625238418579, "step": 2710 }, { "epoch": 0.716921454928835, "grad_norm": 94.02050197231564, "learning_rate": 8.208355297838692e-07, "logits/chosen": 0.793212890625, "logits/rejected": 0.732226550579071, "logps/chosen": -382.25, "logps/rejected": -347.75, "loss": 0.5167, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8208252191543579, "rewards/margins": 1.1931641101837158, "rewards/rejected": -0.3719726502895355, "step": 2720 }, { "epoch": 0.7195571955719557, "grad_norm": 125.81758082001325, "learning_rate": 8.201765946230891e-07, "logits/chosen": 0.6712890863418579, "logits/rejected": 0.7386718988418579, "logps/chosen": -377.1499938964844, "logps/rejected": -340.79998779296875, "loss": 0.6396, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.23396606743335724, "rewards/margins": 0.947436511516571, "rewards/rejected": -0.712353527545929, "step": 2730 }, { "epoch": 0.7221929362150764, "grad_norm": 115.02970140997901, "learning_rate": 8.195176594623088e-07, "logits/chosen": 0.666210949420929, "logits/rejected": 0.5938965082168579, "logps/chosen": -378.3999938964844, "logps/rejected": -342.25, "loss": 0.584, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47820740938186646, "rewards/margins": 1.00732421875, "rewards/rejected": -0.5287109613418579, "step": 2740 }, { "epoch": 0.7248286768581972, "grad_norm": 131.37499690307206, "learning_rate": 8.188587243015287e-07, "logits/chosen": 0.921679675579071, "logits/rejected": 0.90869140625, "logps/chosen": -393.25, "logps/rejected": -369.6000061035156, "loss": 0.6756, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06670532375574112, "rewards/margins": 0.8362792730331421, "rewards/rejected": -0.7680908441543579, "step": 2750 }, { "epoch": 0.7274644175013179, "grad_norm": 160.5755045554733, "learning_rate": 8.181997891407485e-07, "logits/chosen": 0.7984374761581421, "logits/rejected": 0.775097668170929, "logps/chosen": -316.92498779296875, "logps/rejected": -355.3999938964844, "loss": 0.5944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5450073480606079, "rewards/margins": 1.004541039466858, "rewards/rejected": -0.45997315645217896, "step": 2760 }, { "epoch": 0.7301001581444386, "grad_norm": 144.83592184842917, "learning_rate": 8.175408539799683e-07, "logits/chosen": 0.894946277141571, "logits/rejected": 0.8662109375, "logps/chosen": -302.6499938964844, "logps/rejected": -310.8500061035156, "loss": 0.5427, "rewards/accuracies": 0.75, "rewards/chosen": 0.649768054485321, "rewards/margins": 1.1798827648162842, "rewards/rejected": -0.52838134765625, "step": 2770 }, { "epoch": 0.7327358987875593, "grad_norm": 113.43961525210419, "learning_rate": 8.168819188191881e-07, "logits/chosen": 0.884472668170929, "logits/rejected": 0.9312988519668579, "logps/chosen": -318.54998779296875, "logps/rejected": -324.8500061035156, "loss": 0.597, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.48057860136032104, "rewards/margins": 0.810009777545929, "rewards/rejected": -0.3287719786167145, "step": 2780 }, { "epoch": 0.7353716394306801, "grad_norm": 95.50895996469366, "learning_rate": 8.16222983658408e-07, "logits/chosen": 0.907031238079071, "logits/rejected": 0.997753918170929, "logps/chosen": -343.1499938964844, "logps/rejected": -324.3999938964844, "loss": 0.7135, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.60595703125, "rewards/margins": 0.675048828125, "rewards/rejected": -0.06901855766773224, "step": 2790 }, { "epoch": 0.7380073800738007, "grad_norm": 131.97689602887738, "learning_rate": 8.155640484976278e-07, "logits/chosen": 0.9647461175918579, "logits/rejected": 0.962695300579071, "logps/chosen": -328.75, "logps/rejected": -320.1000061035156, "loss": 0.6402, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.785107433795929, "rewards/margins": 0.7806396484375, "rewards/rejected": 0.004504394717514515, "step": 2800 }, { "epoch": 0.7406431207169215, "grad_norm": 202.91434924646506, "learning_rate": 8.149051133368477e-07, "logits/chosen": 0.923828125, "logits/rejected": 0.86376953125, "logps/chosen": -389.70001220703125, "logps/rejected": -331.20001220703125, "loss": 0.6138, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.850292980670929, "rewards/margins": 1.031957983970642, "rewards/rejected": -0.18153075873851776, "step": 2810 }, { "epoch": 0.7432788613600422, "grad_norm": 110.00262628259107, "learning_rate": 8.142461781760674e-07, "logits/chosen": 1.009765625, "logits/rejected": 0.9896484613418579, "logps/chosen": -362.29998779296875, "logps/rejected": -320.3500061035156, "loss": 0.5487, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.990527331829071, "rewards/margins": 0.9942626953125, "rewards/rejected": -0.0033691406715661287, "step": 2820 }, { "epoch": 0.7459146020031628, "grad_norm": 152.09806744457282, "learning_rate": 8.135872430152873e-07, "logits/chosen": 0.82275390625, "logits/rejected": 0.8184570074081421, "logps/chosen": -376.8999938964844, "logps/rejected": -351.95001220703125, "loss": 0.6563, "rewards/accuracies": 0.65625, "rewards/chosen": 0.75439453125, "rewards/margins": 0.9295409917831421, "rewards/rejected": -0.17409667372703552, "step": 2830 }, { "epoch": 0.7485503426462836, "grad_norm": 119.17122711537617, "learning_rate": 8.12928307854507e-07, "logits/chosen": 0.8780273199081421, "logits/rejected": 0.8407226800918579, "logps/chosen": -345.1499938964844, "logps/rejected": -351.57501220703125, "loss": 0.5848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.963330090045929, "rewards/margins": 1.110986351966858, "rewards/rejected": -0.14628906548023224, "step": 2840 }, { "epoch": 0.7511860832894043, "grad_norm": 143.46156647571584, "learning_rate": 8.122693726937269e-07, "logits/chosen": 0.8163086175918579, "logits/rejected": 0.81005859375, "logps/chosen": -367.45001220703125, "logps/rejected": -358.6499938964844, "loss": 0.5331, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.042382836341858, "rewards/margins": 1.2057616710662842, "rewards/rejected": -0.16273193061351776, "step": 2850 }, { "epoch": 0.7538218239325251, "grad_norm": 65.72175953232879, "learning_rate": 8.116104375329466e-07, "logits/chosen": 1.038476586341858, "logits/rejected": 0.9604126214981079, "logps/chosen": -342.1499938964844, "logps/rejected": -316.95001220703125, "loss": 0.6191, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8956543207168579, "rewards/margins": 0.926953136920929, "rewards/rejected": -0.03161010891199112, "step": 2860 }, { "epoch": 0.7564575645756457, "grad_norm": 160.98245012290636, "learning_rate": 8.109515023721666e-07, "logits/chosen": 0.860058605670929, "logits/rejected": 0.868969738483429, "logps/chosen": -354.6499938964844, "logps/rejected": -342.5, "loss": 0.7372, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.122216820716858, "rewards/margins": 0.748767077922821, "rewards/rejected": 0.37443238496780396, "step": 2870 }, { "epoch": 0.7590933052187665, "grad_norm": 136.63234385258298, "learning_rate": 8.102925672113864e-07, "logits/chosen": 0.877636730670929, "logits/rejected": 0.8782714605331421, "logps/chosen": -346.70001220703125, "logps/rejected": -308.7250061035156, "loss": 0.6357, "rewards/accuracies": 0.65625, "rewards/chosen": 0.802929699420929, "rewards/margins": 0.854754626750946, "rewards/rejected": -0.05159912258386612, "step": 2880 }, { "epoch": 0.7617290458618872, "grad_norm": 137.48489941392495, "learning_rate": 8.096336320506062e-07, "logits/chosen": 0.8663085699081421, "logits/rejected": 0.869824230670929, "logps/chosen": -378.54998779296875, "logps/rejected": -340.25, "loss": 0.5205, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.9712890386581421, "rewards/margins": 1.122827172279358, "rewards/rejected": -0.150665283203125, "step": 2890 }, { "epoch": 0.7643647865050079, "grad_norm": 150.4731729369096, "learning_rate": 8.08974696889826e-07, "logits/chosen": 0.79638671875, "logits/rejected": 0.773632824420929, "logps/chosen": -355.5, "logps/rejected": -345.79998779296875, "loss": 0.8725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.860644519329071, "rewards/margins": 0.7517334222793579, "rewards/rejected": 0.10821533203125, "step": 2900 }, { "epoch": 0.7670005271481286, "grad_norm": 103.32018017270269, "learning_rate": 8.083157617290458e-07, "logits/chosen": 1.0173828601837158, "logits/rejected": 0.9720703363418579, "logps/chosen": -367.6499938964844, "logps/rejected": -372.5, "loss": 0.5164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9446777105331421, "rewards/margins": 1.1155273914337158, "rewards/rejected": -0.1705322265625, "step": 2910 }, { "epoch": 0.7696362677912494, "grad_norm": 96.78975479647495, "learning_rate": 8.076568265682656e-07, "logits/chosen": 0.9759765863418579, "logits/rejected": 0.9473632574081421, "logps/chosen": -307.0, "logps/rejected": -304.04998779296875, "loss": 0.5369, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.10205078125, "rewards/margins": 1.0117919445037842, "rewards/rejected": 0.08997802436351776, "step": 2920 }, { "epoch": 0.77227200843437, "grad_norm": 93.64467091659061, "learning_rate": 8.069978914074855e-07, "logits/chosen": 0.880786120891571, "logits/rejected": 0.7593749761581421, "logps/chosen": -379.1000061035156, "logps/rejected": -356.79998779296875, "loss": 0.546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.085595726966858, "rewards/margins": 1.2314941883087158, "rewards/rejected": -0.14760741591453552, "step": 2930 }, { "epoch": 0.7749077490774908, "grad_norm": 86.36659453655005, "learning_rate": 8.063389562467053e-07, "logits/chosen": 0.981249988079071, "logits/rejected": 0.9688476324081421, "logps/chosen": -347.6499938964844, "logps/rejected": -328.75, "loss": 0.6004, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7318481206893921, "rewards/margins": 1.0314452648162842, "rewards/rejected": -0.2992797791957855, "step": 2940 }, { "epoch": 0.7775434897206115, "grad_norm": 149.89037597762757, "learning_rate": 8.056800210859252e-07, "logits/chosen": 0.9886718988418579, "logits/rejected": 0.887402355670929, "logps/chosen": -337.29998779296875, "logps/rejected": -357.75, "loss": 0.5631, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.666473388671875, "rewards/margins": 1.3597900867462158, "rewards/rejected": -0.6929687261581421, "step": 2950 }, { "epoch": 0.7801792303637322, "grad_norm": 136.81197817025358, "learning_rate": 8.050210859251449e-07, "logits/chosen": 0.912109375, "logits/rejected": 0.784472644329071, "logps/chosen": -376.25, "logps/rejected": -352.20001220703125, "loss": 0.5667, "rewards/accuracies": 0.75, "rewards/chosen": 0.601025402545929, "rewards/margins": 1.3126952648162842, "rewards/rejected": -0.7115234136581421, "step": 2960 }, { "epoch": 0.7828149710068529, "grad_norm": 106.49376156629552, "learning_rate": 8.043621507643648e-07, "logits/chosen": 0.772265613079071, "logits/rejected": 0.748095691204071, "logps/chosen": -363.79998779296875, "logps/rejected": -349.75, "loss": 0.602, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.42352294921875, "rewards/margins": 1.1947753429412842, "rewards/rejected": -0.771313488483429, "step": 2970 }, { "epoch": 0.7854507116499736, "grad_norm": 128.13429510549747, "learning_rate": 8.037032156035846e-07, "logits/chosen": 0.7884765863418579, "logits/rejected": 0.80859375, "logps/chosen": -352.29998779296875, "logps/rejected": -351.45001220703125, "loss": 0.6028, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.7212158441543579, "rewards/margins": 1.1398437023162842, "rewards/rejected": -0.4181976318359375, "step": 2980 }, { "epoch": 0.7880864522930944, "grad_norm": 142.78229930851896, "learning_rate": 8.030442804428044e-07, "logits/chosen": 0.751873791217804, "logits/rejected": 0.7734375, "logps/chosen": -348.29998779296875, "logps/rejected": -373.1499938964844, "loss": 0.6409, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.772998034954071, "rewards/margins": 0.907421886920929, "rewards/rejected": -0.13530273735523224, "step": 2990 }, { "epoch": 0.790722192936215, "grad_norm": 169.4158723335224, "learning_rate": 8.023853452820242e-07, "logits/chosen": 0.864025890827179, "logits/rejected": 0.844531238079071, "logps/chosen": -321.625, "logps/rejected": -304.6000061035156, "loss": 0.7191, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.543261706829071, "rewards/margins": 0.747875988483429, "rewards/rejected": -0.204833984375, "step": 3000 }, { "epoch": 0.7933579335793358, "grad_norm": 100.60940355373636, "learning_rate": 8.017264101212439e-07, "logits/chosen": 0.8209472894668579, "logits/rejected": 0.786376953125, "logps/chosen": -340.3999938964844, "logps/rejected": -343.95001220703125, "loss": 0.502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.763781726360321, "rewards/margins": 1.2353515625, "rewards/rejected": -0.4731079041957855, "step": 3010 }, { "epoch": 0.7959936742224565, "grad_norm": 157.09250870151175, "learning_rate": 8.010674749604639e-07, "logits/chosen": 0.7608398199081421, "logits/rejected": 0.742382824420929, "logps/chosen": -375.54998779296875, "logps/rejected": -341.4750061035156, "loss": 0.5962, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.58599853515625, "rewards/margins": 0.878955066204071, "rewards/rejected": -0.29444581270217896, "step": 3020 }, { "epoch": 0.7986294148655773, "grad_norm": 149.5885794269222, "learning_rate": 8.004085397996838e-07, "logits/chosen": 0.947460949420929, "logits/rejected": 0.805224597454071, "logps/chosen": -364.95001220703125, "logps/rejected": -335.29998779296875, "loss": 0.6055, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.7046142816543579, "rewards/margins": 1.065893530845642, "rewards/rejected": -0.36125487089157104, "step": 3030 }, { "epoch": 0.8012651555086979, "grad_norm": 114.55068406102627, "learning_rate": 7.997496046389035e-07, "logits/chosen": 0.8062988519668579, "logits/rejected": 0.824511706829071, "logps/chosen": -358.8500061035156, "logps/rejected": -345.25, "loss": 0.6055, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7011474370956421, "rewards/margins": 1.01947021484375, "rewards/rejected": -0.31853026151657104, "step": 3040 }, { "epoch": 0.8039008961518187, "grad_norm": 107.1651723292309, "learning_rate": 7.990906694781233e-07, "logits/chosen": 0.8953613042831421, "logits/rejected": 0.8875976800918579, "logps/chosen": -417.29998779296875, "logps/rejected": -363.0, "loss": 0.6022, "rewards/accuracies": 0.625, "rewards/chosen": 0.814990222454071, "rewards/margins": 0.8595215082168579, "rewards/rejected": -0.04414062574505806, "step": 3050 }, { "epoch": 0.8065366367949394, "grad_norm": 126.34761048323337, "learning_rate": 7.984317343173431e-07, "logits/chosen": 0.8726562261581421, "logits/rejected": 0.911914050579071, "logps/chosen": -352.79998779296875, "logps/rejected": -321.6499938964844, "loss": 0.6432, "rewards/accuracies": 0.65625, "rewards/chosen": 0.6409667730331421, "rewards/margins": 0.7930663824081421, "rewards/rejected": -0.15300293266773224, "step": 3060 }, { "epoch": 0.8091723774380601, "grad_norm": 102.18198045662491, "learning_rate": 7.977727991565629e-07, "logits/chosen": 0.735760509967804, "logits/rejected": 0.827807605266571, "logps/chosen": -335.3999938964844, "logps/rejected": -363.29998779296875, "loss": 0.5947, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4036865234375, "rewards/margins": 1.0156738758087158, "rewards/rejected": -0.6119629144668579, "step": 3070 }, { "epoch": 0.8118081180811808, "grad_norm": 448.3841289888972, "learning_rate": 7.971138639957827e-07, "logits/chosen": 0.639697253704071, "logits/rejected": 0.5079925656318665, "logps/chosen": -338.625, "logps/rejected": -329.75, "loss": 0.6869, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19328613579273224, "rewards/margins": 0.762561023235321, "rewards/rejected": -0.5694793462753296, "step": 3080 }, { "epoch": 0.8144438587243016, "grad_norm": 385.1365391524855, "learning_rate": 7.964549288350026e-07, "logits/chosen": 0.5858398675918579, "logits/rejected": 0.505114734172821, "logps/chosen": -376.70001220703125, "logps/rejected": -367.20001220703125, "loss": 0.6211, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13399657607078552, "rewards/margins": 1.158410668373108, "rewards/rejected": -1.0241210460662842, "step": 3090 }, { "epoch": 0.8170795993674222, "grad_norm": 106.14647980371373, "learning_rate": 7.957959936742225e-07, "logits/chosen": 0.6812988519668579, "logits/rejected": 0.6571410894393921, "logps/chosen": -349.67498779296875, "logps/rejected": -348.0, "loss": 0.6084, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5534301996231079, "rewards/margins": 0.9234344363212585, "rewards/rejected": -0.3692871034145355, "step": 3100 }, { "epoch": 0.8197153400105429, "grad_norm": 125.37172460939837, "learning_rate": 7.951370585134422e-07, "logits/chosen": 0.8052734136581421, "logits/rejected": 0.823535144329071, "logps/chosen": -366.25, "logps/rejected": -366.79998779296875, "loss": 0.5254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6619873046875, "rewards/margins": 1.11669921875, "rewards/rejected": -0.4537109434604645, "step": 3110 }, { "epoch": 0.8223510806536637, "grad_norm": 151.95250485467326, "learning_rate": 7.944781233526621e-07, "logits/chosen": 0.841113269329071, "logits/rejected": 0.833691418170929, "logps/chosen": -339.07501220703125, "logps/rejected": -348.04998779296875, "loss": 0.5026, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.996722400188446, "rewards/margins": 1.2502562999725342, "rewards/rejected": -0.2535461485385895, "step": 3120 }, { "epoch": 0.8249868212967844, "grad_norm": 124.82601659298423, "learning_rate": 7.938191881918818e-07, "logits/chosen": 0.624755859375, "logits/rejected": 0.6607666015625, "logps/chosen": -359.6499938964844, "logps/rejected": -342.04998779296875, "loss": 0.5937, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9107421636581421, "rewards/margins": 1.1016113758087158, "rewards/rejected": -0.19172362983226776, "step": 3130 }, { "epoch": 0.8276225619399051, "grad_norm": 127.46353371484689, "learning_rate": 7.931602530311017e-07, "logits/chosen": 0.8099609613418579, "logits/rejected": 0.7489258050918579, "logps/chosen": -363.3999938964844, "logps/rejected": -327.95001220703125, "loss": 0.7757, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.565722644329071, "rewards/margins": 0.909375011920929, "rewards/rejected": -0.3438476622104645, "step": 3140 }, { "epoch": 0.8302583025830258, "grad_norm": 78.864026805485, "learning_rate": 7.925013178703215e-07, "logits/chosen": 0.823559582233429, "logits/rejected": 0.7476562261581421, "logps/chosen": -346.6499938964844, "logps/rejected": -330.04998779296875, "loss": 0.5301, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.8975585699081421, "rewards/margins": 1.061376929283142, "rewards/rejected": -0.16423340141773224, "step": 3150 }, { "epoch": 0.8328940432261466, "grad_norm": 132.95487404137444, "learning_rate": 7.918423827095413e-07, "logits/chosen": 0.782763659954071, "logits/rejected": 0.767871081829071, "logps/chosen": -415.1000061035156, "logps/rejected": -350.95001220703125, "loss": 0.6166, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.833740234375, "rewards/margins": 0.988085925579071, "rewards/rejected": -0.15457764267921448, "step": 3160 }, { "epoch": 0.8355297838692672, "grad_norm": 87.12484153469521, "learning_rate": 7.911834475487612e-07, "logits/chosen": 0.926464855670929, "logits/rejected": 0.7928711175918579, "logps/chosen": -382.0, "logps/rejected": -340.79998779296875, "loss": 0.4745, "rewards/accuracies": 0.78125, "rewards/chosen": 1.146875023841858, "rewards/margins": 1.3126952648162842, "rewards/rejected": -0.16567382216453552, "step": 3170 }, { "epoch": 0.838165524512388, "grad_norm": 94.96277243954572, "learning_rate": 7.90524512387981e-07, "logits/chosen": 0.7452148199081421, "logits/rejected": 0.7339843511581421, "logps/chosen": -344.29998779296875, "logps/rejected": -346.04998779296875, "loss": 0.6156, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6341308355331421, "rewards/margins": 0.9959716796875, "rewards/rejected": -0.361572265625, "step": 3180 }, { "epoch": 0.8408012651555087, "grad_norm": 99.27565659124319, "learning_rate": 7.898655772272008e-07, "logits/chosen": 0.8667968511581421, "logits/rejected": 0.9166504144668579, "logps/chosen": -326.6000061035156, "logps/rejected": -340.8999938964844, "loss": 0.6239, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.523632824420929, "rewards/margins": 0.9305664300918579, "rewards/rejected": -0.4075561463832855, "step": 3190 }, { "epoch": 0.8434370057986295, "grad_norm": 78.6517767509134, "learning_rate": 7.892066420664207e-07, "logits/chosen": 0.7304931879043579, "logits/rejected": 0.5843750238418579, "logps/chosen": -381.75, "logps/rejected": -322.29998779296875, "loss": 0.5342, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.5160278081893921, "rewards/margins": 1.2394530773162842, "rewards/rejected": -0.723388671875, "step": 3200 }, { "epoch": 0.8460727464417501, "grad_norm": 102.53350156531882, "learning_rate": 7.885477069056404e-07, "logits/chosen": 0.7498779296875, "logits/rejected": 0.6644531488418579, "logps/chosen": -378.0, "logps/rejected": -316.8500061035156, "loss": 0.5663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.51458740234375, "rewards/margins": 1.138037085533142, "rewards/rejected": -0.6236327886581421, "step": 3210 }, { "epoch": 0.8487084870848709, "grad_norm": 133.69081962717956, "learning_rate": 7.878887717448603e-07, "logits/chosen": 1.0463378429412842, "logits/rejected": 0.762646496295929, "logps/chosen": -370.1499938964844, "logps/rejected": -323.6000061035156, "loss": 0.531, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.611132800579071, "rewards/margins": 1.1902344226837158, "rewards/rejected": -0.5782715082168579, "step": 3220 }, { "epoch": 0.8513442277279916, "grad_norm": 140.2091374942817, "learning_rate": 7.8722983658408e-07, "logits/chosen": 0.765521228313446, "logits/rejected": 0.7457031011581421, "logps/chosen": -361.6499938964844, "logps/rejected": -349.95001220703125, "loss": 0.471, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.6759582757949829, "rewards/margins": 1.5011718273162842, "rewards/rejected": -0.82427978515625, "step": 3230 }, { "epoch": 0.8539799683711122, "grad_norm": 127.39529091166433, "learning_rate": 7.865709014233e-07, "logits/chosen": 0.665942370891571, "logits/rejected": 0.64697265625, "logps/chosen": -346.79998779296875, "logps/rejected": -316.25, "loss": 0.5571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5485595464706421, "rewards/margins": 1.1520507335662842, "rewards/rejected": -0.604327380657196, "step": 3240 }, { "epoch": 0.856615709014233, "grad_norm": 91.30103696503109, "learning_rate": 7.859119662625197e-07, "logits/chosen": 0.6338866949081421, "logits/rejected": 0.6114441156387329, "logps/chosen": -375.25, "logps/rejected": -370.75, "loss": 0.5779, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.508068859577179, "rewards/margins": 1.33642578125, "rewards/rejected": -0.830078125, "step": 3250 }, { "epoch": 0.8592514496573537, "grad_norm": 108.96777664865536, "learning_rate": 7.852530311017396e-07, "logits/chosen": 0.940234363079071, "logits/rejected": 0.8114258050918579, "logps/chosen": -361.54998779296875, "logps/rejected": -351.04998779296875, "loss": 0.5591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.255859375, "rewards/margins": 1.254492163658142, "rewards/rejected": -0.998730480670929, "step": 3260 }, { "epoch": 0.8618871903004744, "grad_norm": 153.7575525264612, "learning_rate": 7.845940959409594e-07, "logits/chosen": 0.6396484375, "logits/rejected": 0.637683093547821, "logps/chosen": -399.8500061035156, "logps/rejected": -361.04998779296875, "loss": 0.5346, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.14191894233226776, "rewards/margins": 1.268774390220642, "rewards/rejected": -1.126806616783142, "step": 3270 }, { "epoch": 0.8645229309435951, "grad_norm": 93.34822925583528, "learning_rate": 7.839351607801792e-07, "logits/chosen": 0.592755138874054, "logits/rejected": 0.6429687738418579, "logps/chosen": -417.0, "logps/rejected": -390.5, "loss": 0.4745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1072998046875, "rewards/margins": 1.415624976158142, "rewards/rejected": -1.3087890148162842, "step": 3280 }, { "epoch": 0.8671586715867159, "grad_norm": 99.77643691224631, "learning_rate": 7.83276225619399e-07, "logits/chosen": 0.4808349609375, "logits/rejected": 0.42292481660842896, "logps/chosen": -363.6499938964844, "logps/rejected": -386.8999938964844, "loss": 0.4628, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23419189453125, "rewards/margins": 1.5192382335662842, "rewards/rejected": -1.284521460533142, "step": 3290 }, { "epoch": 0.8697944122298366, "grad_norm": 148.6637252663985, "learning_rate": 7.826172904586188e-07, "logits/chosen": 0.5908447504043579, "logits/rejected": 0.5545898675918579, "logps/chosen": -398.6000061035156, "logps/rejected": -347.3999938964844, "loss": 0.5479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.016754150390625, "rewards/margins": 1.2353515625, "rewards/rejected": -1.218164086341858, "step": 3300 }, { "epoch": 0.8724301528729573, "grad_norm": 124.88017589937374, "learning_rate": 7.819583552978387e-07, "logits/chosen": 0.4325500428676605, "logits/rejected": 0.4082275331020355, "logps/chosen": -351.8999938964844, "logps/rejected": -335.75, "loss": 0.5146, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.01406249962747097, "rewards/margins": 1.5576660633087158, "rewards/rejected": -1.5419921875, "step": 3310 }, { "epoch": 0.875065893516078, "grad_norm": 122.95186564826537, "learning_rate": 7.812994201370586e-07, "logits/chosen": 0.684008777141571, "logits/rejected": 0.6364074945449829, "logps/chosen": -344.29998779296875, "logps/rejected": -323.0, "loss": 0.663, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.17324218153953552, "rewards/margins": 1.1397216320037842, "rewards/rejected": -1.3129150867462158, "step": 3320 }, { "epoch": 0.8777016341591988, "grad_norm": 107.42943446021617, "learning_rate": 7.806404849762783e-07, "logits/chosen": 0.5906432867050171, "logits/rejected": 0.5652099847793579, "logps/chosen": -365.0, "logps/rejected": -323.8500061035156, "loss": 0.7005, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.03715820237994194, "rewards/margins": 0.8865906000137329, "rewards/rejected": -0.849560558795929, "step": 3330 }, { "epoch": 0.8803373748023194, "grad_norm": 168.53704224794416, "learning_rate": 7.799815498154982e-07, "logits/chosen": 0.75048828125, "logits/rejected": 0.763916015625, "logps/chosen": -325.79998779296875, "logps/rejected": -344.5, "loss": 0.8134, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.20351867377758026, "rewards/margins": 0.93994140625, "rewards/rejected": -1.14599609375, "step": 3340 }, { "epoch": 0.8829731154454402, "grad_norm": 135.8137855813238, "learning_rate": 7.793226146547179e-07, "logits/chosen": 0.6806640625, "logits/rejected": 0.6939941644668579, "logps/chosen": -387.3999938964844, "logps/rejected": -370.8999938964844, "loss": 0.6271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22667846083641052, "rewards/margins": 1.235009789466858, "rewards/rejected": -1.0098145008087158, "step": 3350 }, { "epoch": 0.8856088560885609, "grad_norm": 84.90825730832731, "learning_rate": 7.786636794939378e-07, "logits/chosen": 0.78857421875, "logits/rejected": 0.633496105670929, "logps/chosen": -335.5, "logps/rejected": -324.3500061035156, "loss": 0.512, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.41119384765625, "rewards/margins": 1.353613257408142, "rewards/rejected": -0.943164050579071, "step": 3360 }, { "epoch": 0.8882445967316817, "grad_norm": 134.46614420346575, "learning_rate": 7.780047443331575e-07, "logits/chosen": 0.6684631109237671, "logits/rejected": 0.636792004108429, "logps/chosen": -375.20001220703125, "logps/rejected": -338.0, "loss": 0.5995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09311523288488388, "rewards/margins": 1.079003930091858, "rewards/rejected": -0.9863525629043579, "step": 3370 }, { "epoch": 0.8908803373748023, "grad_norm": 157.4364897366228, "learning_rate": 7.773458091723773e-07, "logits/chosen": 0.8065429925918579, "logits/rejected": 0.727734386920929, "logps/chosen": -338.1000061035156, "logps/rejected": -337.5, "loss": 0.5976, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.29165345430374146, "rewards/margins": 1.1224243640899658, "rewards/rejected": -0.8302459716796875, "step": 3380 }, { "epoch": 0.893516078017923, "grad_norm": 111.55352653229598, "learning_rate": 7.766868740115973e-07, "logits/chosen": 0.7569824457168579, "logits/rejected": 0.555249035358429, "logps/chosen": -363.95001220703125, "logps/rejected": -322.3999938964844, "loss": 0.5479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20435485243797302, "rewards/margins": 1.2931640148162842, "rewards/rejected": -1.089685082435608, "step": 3390 }, { "epoch": 0.8961518186610438, "grad_norm": 186.04734682898837, "learning_rate": 7.76027938850817e-07, "logits/chosen": 0.7884765863418579, "logits/rejected": 0.7704101800918579, "logps/chosen": -378.3999938964844, "logps/rejected": -327.3999938964844, "loss": 0.5295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18853759765625, "rewards/margins": 1.1635010242462158, "rewards/rejected": -0.9744628667831421, "step": 3400 }, { "epoch": 0.8987875593041644, "grad_norm": 138.44348929061488, "learning_rate": 7.753690036900369e-07, "logits/chosen": 0.711596667766571, "logits/rejected": 0.74658203125, "logps/chosen": -331.6499938964844, "logps/rejected": -267.70001220703125, "loss": 0.566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.515637218952179, "rewards/margins": 0.919140636920929, "rewards/rejected": -0.4035278260707855, "step": 3410 }, { "epoch": 0.9014232999472852, "grad_norm": 85.97330050402054, "learning_rate": 7.747100685292566e-07, "logits/chosen": 0.96435546875, "logits/rejected": 0.852734386920929, "logps/chosen": -325.8999938964844, "logps/rejected": -322.25, "loss": 0.6412, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.47926634550094604, "rewards/margins": 0.913159191608429, "rewards/rejected": -0.4339538514614105, "step": 3420 }, { "epoch": 0.9040590405904059, "grad_norm": 107.27220937636403, "learning_rate": 7.740511333684765e-07, "logits/chosen": 0.6792968511581421, "logits/rejected": 0.600781261920929, "logps/chosen": -362.8999938964844, "logps/rejected": -357.54998779296875, "loss": 0.61, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.46760863065719604, "rewards/margins": 0.9997314214706421, "rewards/rejected": -0.5323730707168579, "step": 3430 }, { "epoch": 0.9066947812335266, "grad_norm": 90.28834834801194, "learning_rate": 7.733921982076963e-07, "logits/chosen": 0.811328113079071, "logits/rejected": 0.716845691204071, "logps/chosen": -367.6000061035156, "logps/rejected": -306.6000061035156, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": 0.6958984136581421, "rewards/margins": 1.098413109779358, "rewards/rejected": -0.40260010957717896, "step": 3440 }, { "epoch": 0.9093305218766473, "grad_norm": 135.20026148164445, "learning_rate": 7.727332630469161e-07, "logits/chosen": 0.7269042730331421, "logits/rejected": 0.678387463092804, "logps/chosen": -387.45001220703125, "logps/rejected": -334.3500061035156, "loss": 0.5135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.300668329000473, "rewards/margins": 1.1905517578125, "rewards/rejected": -0.890057384967804, "step": 3450 }, { "epoch": 0.9119662625197681, "grad_norm": 88.46223596851547, "learning_rate": 7.72074327886136e-07, "logits/chosen": 0.6505371332168579, "logits/rejected": 0.682910144329071, "logps/chosen": -351.6000061035156, "logps/rejected": -339.29998779296875, "loss": 0.5491, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09357299655675888, "rewards/margins": 1.154760718345642, "rewards/rejected": -1.060998558998108, "step": 3460 }, { "epoch": 0.9146020031628888, "grad_norm": 153.49998458025968, "learning_rate": 7.714153927253558e-07, "logits/chosen": 0.6907958984375, "logits/rejected": 0.689526379108429, "logps/chosen": -378.1000061035156, "logps/rejected": -379.8500061035156, "loss": 0.6271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03693847730755806, "rewards/margins": 0.967089831829071, "rewards/rejected": -1.0049316883087158, "step": 3470 }, { "epoch": 0.9172377438060095, "grad_norm": 106.30448943303043, "learning_rate": 7.707564575645756e-07, "logits/chosen": 0.662731945514679, "logits/rejected": 0.6167968511581421, "logps/chosen": -365.1000061035156, "logps/rejected": -367.79998779296875, "loss": 0.578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01961059495806694, "rewards/margins": 1.036523461341858, "rewards/rejected": -1.055566430091858, "step": 3480 }, { "epoch": 0.9198734844491302, "grad_norm": 157.69174295816276, "learning_rate": 7.700975224037955e-07, "logits/chosen": 0.5539306402206421, "logits/rejected": 0.522430419921875, "logps/chosen": -371.79998779296875, "logps/rejected": -365.25, "loss": 0.5579, "rewards/accuracies": 0.71875, "rewards/chosen": 0.13087157905101776, "rewards/margins": 1.230676293373108, "rewards/rejected": -1.100488305091858, "step": 3490 }, { "epoch": 0.922509225092251, "grad_norm": 159.20137824427422, "learning_rate": 7.694385872430152e-07, "logits/chosen": 0.697949230670929, "logits/rejected": 0.6927245855331421, "logps/chosen": -375.75, "logps/rejected": -404.6499938964844, "loss": 0.6205, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.39028626680374146, "rewards/margins": 1.287988305091858, "rewards/rejected": -0.898632824420929, "step": 3500 }, { "epoch": 0.9251449657353716, "grad_norm": 54.012729935477886, "learning_rate": 7.687796520822351e-07, "logits/chosen": 0.68408203125, "logits/rejected": 0.6252685785293579, "logps/chosen": -374.25, "logps/rejected": -384.5, "loss": 0.4958, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3644042909145355, "rewards/margins": 1.3347656726837158, "rewards/rejected": -0.9713379144668579, "step": 3510 }, { "epoch": 0.9277807063784923, "grad_norm": 102.26229098051358, "learning_rate": 7.681207169214548e-07, "logits/chosen": 0.7490234375, "logits/rejected": 0.553417980670929, "logps/chosen": -345.25, "logps/rejected": -316.95001220703125, "loss": 0.6992, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21344605088233948, "rewards/margins": 1.0952637195587158, "rewards/rejected": -0.882128894329071, "step": 3520 }, { "epoch": 0.9304164470216131, "grad_norm": 157.44942201219536, "learning_rate": 7.674617817606747e-07, "logits/chosen": 0.8357909917831421, "logits/rejected": 0.6905273199081421, "logps/chosen": -418.45001220703125, "logps/rejected": -384.70001220703125, "loss": 0.5754, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.4633850157260895, "rewards/margins": 1.144921898841858, "rewards/rejected": -0.681640625, "step": 3530 }, { "epoch": 0.9330521876647337, "grad_norm": 100.62542304093085, "learning_rate": 7.668028465998946e-07, "logits/chosen": 0.709521472454071, "logits/rejected": 0.63134765625, "logps/chosen": -401.1000061035156, "logps/rejected": -370.95001220703125, "loss": 0.5498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3612426817417145, "rewards/margins": 1.156396508216858, "rewards/rejected": -0.795458972454071, "step": 3540 }, { "epoch": 0.9356879283078545, "grad_norm": 80.9624348016811, "learning_rate": 7.661439114391144e-07, "logits/chosen": 0.6971679925918579, "logits/rejected": 0.655957043170929, "logps/chosen": -336.0, "logps/rejected": -334.8500061035156, "loss": 0.5329, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4833007752895355, "rewards/margins": 1.47607421875, "rewards/rejected": -0.991503894329071, "step": 3550 }, { "epoch": 0.9383236689509752, "grad_norm": 96.967169928321, "learning_rate": 7.654849762783342e-07, "logits/chosen": 0.722949206829071, "logits/rejected": 0.549072265625, "logps/chosen": -327.25, "logps/rejected": -331.04998779296875, "loss": 0.5333, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3627563416957855, "rewards/margins": 1.372167944908142, "rewards/rejected": -1.0101807117462158, "step": 3560 }, { "epoch": 0.940959409594096, "grad_norm": 144.83763440800453, "learning_rate": 7.64826041117554e-07, "logits/chosen": 0.702832043170929, "logits/rejected": 0.7529236078262329, "logps/chosen": -373.2250061035156, "logps/rejected": -355.45001220703125, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22556152939796448, "rewards/margins": 1.000634789466858, "rewards/rejected": -0.773852527141571, "step": 3570 }, { "epoch": 0.9435951502372166, "grad_norm": 58.328075758550256, "learning_rate": 7.641671059567738e-07, "logits/chosen": 0.6881347894668579, "logits/rejected": 0.65380859375, "logps/chosen": -323.57501220703125, "logps/rejected": -342.0, "loss": 0.5113, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4748474061489105, "rewards/margins": 1.220703125, "rewards/rejected": -0.745312511920929, "step": 3580 }, { "epoch": 0.9462308908803374, "grad_norm": 110.29464163535285, "learning_rate": 7.635081707959936e-07, "logits/chosen": 0.7320312261581421, "logits/rejected": 0.5548340082168579, "logps/chosen": -394.6000061035156, "logps/rejected": -359.04998779296875, "loss": 0.6106, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.500012218952179, "rewards/margins": 1.058203101158142, "rewards/rejected": -0.557385265827179, "step": 3590 }, { "epoch": 0.9488666315234581, "grad_norm": 203.66518932726973, "learning_rate": 7.628492356352134e-07, "logits/chosen": 0.69677734375, "logits/rejected": 0.747387707233429, "logps/chosen": -364.04998779296875, "logps/rejected": -339.29998779296875, "loss": 0.5903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5816589593887329, "rewards/margins": 1.1959960460662842, "rewards/rejected": -0.612902820110321, "step": 3600 }, { "epoch": 0.9515023721665788, "grad_norm": 123.69606589447359, "learning_rate": 7.621903004744334e-07, "logits/chosen": 0.888671875, "logits/rejected": 0.764636218547821, "logps/chosen": -411.79998779296875, "logps/rejected": -365.6000061035156, "loss": 0.6347, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.7493652105331421, "rewards/margins": 1.129003882408142, "rewards/rejected": -0.38041990995407104, "step": 3610 }, { "epoch": 0.9541381128096995, "grad_norm": 117.41647891128841, "learning_rate": 7.615313653136531e-07, "logits/chosen": 0.776562511920929, "logits/rejected": 0.6075683832168579, "logps/chosen": -406.6000061035156, "logps/rejected": -326.95001220703125, "loss": 0.4646, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8535682559013367, "rewards/margins": 1.3860352039337158, "rewards/rejected": -0.531567394733429, "step": 3620 }, { "epoch": 0.9567738534528203, "grad_norm": 114.56677464524358, "learning_rate": 7.60872430152873e-07, "logits/chosen": 0.6515868902206421, "logits/rejected": 0.600146472454071, "logps/chosen": -372.8999938964844, "logps/rejected": -344.25, "loss": 0.6898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.572094738483429, "rewards/margins": 0.953686535358429, "rewards/rejected": -0.3817138671875, "step": 3630 }, { "epoch": 0.959409594095941, "grad_norm": 126.00273708391951, "learning_rate": 7.602134949920927e-07, "logits/chosen": 0.7997070550918579, "logits/rejected": 0.76275634765625, "logps/chosen": -381.79998779296875, "logps/rejected": -364.70001220703125, "loss": 0.5029, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.916333019733429, "rewards/margins": 1.33154296875, "rewards/rejected": -0.41483765840530396, "step": 3640 }, { "epoch": 0.9620453347390617, "grad_norm": 104.65829579660844, "learning_rate": 7.595545598313126e-07, "logits/chosen": 0.6598876714706421, "logits/rejected": 0.696972668170929, "logps/chosen": -366.29998779296875, "logps/rejected": -328.25, "loss": 0.5862, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.893994152545929, "rewards/margins": 0.991381824016571, "rewards/rejected": -0.09689941257238388, "step": 3650 }, { "epoch": 0.9646810753821824, "grad_norm": 84.98983592022526, "learning_rate": 7.588956246705324e-07, "logits/chosen": 0.9296875, "logits/rejected": 0.920654296875, "logps/chosen": -341.7250061035156, "logps/rejected": -343.1000061035156, "loss": 0.5546, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.966748058795929, "rewards/margins": 1.2492187023162842, "rewards/rejected": -0.2834228575229645, "step": 3660 }, { "epoch": 0.9673168160253031, "grad_norm": 153.98415183721562, "learning_rate": 7.582366895097522e-07, "logits/chosen": 0.839794933795929, "logits/rejected": 0.77197265625, "logps/chosen": -334.5, "logps/rejected": -362.29998779296875, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": 0.807751476764679, "rewards/margins": 0.942675769329071, "rewards/rejected": -0.134837344288826, "step": 3670 }, { "epoch": 0.9699525566684238, "grad_norm": 68.82435384880625, "learning_rate": 7.57577754348972e-07, "logits/chosen": 0.774707019329071, "logits/rejected": 0.6963866949081421, "logps/chosen": -352.79998779296875, "logps/rejected": -352.29998779296875, "loss": 0.4903, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6348205804824829, "rewards/margins": 1.4351074695587158, "rewards/rejected": -0.799853503704071, "step": 3680 }, { "epoch": 0.9725882973115445, "grad_norm": 74.1231558128489, "learning_rate": 7.569188191881919e-07, "logits/chosen": 0.65576171875, "logits/rejected": 0.61083984375, "logps/chosen": -357.0, "logps/rejected": -325.29998779296875, "loss": 0.5071, "rewards/accuracies": 0.71875, "rewards/chosen": 0.53485107421875, "rewards/margins": 1.2610352039337158, "rewards/rejected": -0.7273772954940796, "step": 3690 }, { "epoch": 0.9752240379546653, "grad_norm": 127.48655710114788, "learning_rate": 7.562598840274117e-07, "logits/chosen": 0.5782715082168579, "logits/rejected": 0.5623534917831421, "logps/chosen": -394.45001220703125, "logps/rejected": -378.8500061035156, "loss": 0.6717, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07903747260570526, "rewards/margins": 1.0023193359375, "rewards/rejected": -1.0808837413787842, "step": 3700 }, { "epoch": 0.977859778597786, "grad_norm": 119.50398545185949, "learning_rate": 7.556009488666316e-07, "logits/chosen": 0.8853515386581421, "logits/rejected": 0.8216797113418579, "logps/chosen": -415.3999938964844, "logps/rejected": -383.79998779296875, "loss": 0.6185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23383788764476776, "rewards/margins": 1.2122070789337158, "rewards/rejected": -0.9784911870956421, "step": 3710 }, { "epoch": 0.9804955192409067, "grad_norm": 111.786914312192, "learning_rate": 7.549420137058513e-07, "logits/chosen": 0.830517590045929, "logits/rejected": 0.8953613042831421, "logps/chosen": -379.5, "logps/rejected": -359.6499938964844, "loss": 0.4846, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6990966796875, "rewards/margins": 1.1738770008087158, "rewards/rejected": -0.4737304747104645, "step": 3720 }, { "epoch": 0.9831312598840274, "grad_norm": 115.40573853320078, "learning_rate": 7.542830785450712e-07, "logits/chosen": 0.945343017578125, "logits/rejected": 0.852734386920929, "logps/chosen": -352.29998779296875, "logps/rejected": -356.3999938964844, "loss": 0.5745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7889648675918579, "rewards/margins": 1.0686523914337158, "rewards/rejected": -0.2786193788051605, "step": 3730 }, { "epoch": 0.9857670005271482, "grad_norm": 197.4827145029206, "learning_rate": 7.536241433842909e-07, "logits/chosen": 0.6687682867050171, "logits/rejected": 0.759796142578125, "logps/chosen": -357.2250061035156, "logps/rejected": -344.3500061035156, "loss": 0.6011, "rewards/accuracies": 0.6875, "rewards/chosen": 0.795581042766571, "rewards/margins": 0.95751953125, "rewards/rejected": -0.16160888969898224, "step": 3740 }, { "epoch": 0.9884027411702688, "grad_norm": 146.1202778592738, "learning_rate": 7.529652082235107e-07, "logits/chosen": 0.9646972417831421, "logits/rejected": 0.81103515625, "logps/chosen": -379.1000061035156, "logps/rejected": -362.04998779296875, "loss": 0.6188, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.667651355266571, "rewards/margins": 1.0082519054412842, "rewards/rejected": -0.3399414122104645, "step": 3750 }, { "epoch": 0.9910384818133896, "grad_norm": 121.06745708708738, "learning_rate": 7.523062730627306e-07, "logits/chosen": 0.78564453125, "logits/rejected": 0.76806640625, "logps/chosen": -396.70001220703125, "logps/rejected": -379.5, "loss": 0.606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.45439451932907104, "rewards/margins": 1.0714843273162842, "rewards/rejected": -0.616992175579071, "step": 3760 }, { "epoch": 0.9936742224565103, "grad_norm": 129.85949740836242, "learning_rate": 7.516473379019504e-07, "logits/chosen": 0.8622802495956421, "logits/rejected": 0.814160168170929, "logps/chosen": -364.29998779296875, "logps/rejected": -342.1499938964844, "loss": 0.5778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6857665777206421, "rewards/margins": 1.077233910560608, "rewards/rejected": -0.3930816650390625, "step": 3770 }, { "epoch": 0.996309963099631, "grad_norm": 112.025898224188, "learning_rate": 7.509884027411703e-07, "logits/chosen": 0.898242175579071, "logits/rejected": 0.7000976800918579, "logps/chosen": -413.1499938964844, "logps/rejected": -354.25, "loss": 0.5976, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.6275268793106079, "rewards/margins": 1.1096680164337158, "rewards/rejected": -0.4826904237270355, "step": 3780 }, { "epoch": 0.9989457037427517, "grad_norm": 138.25322668157452, "learning_rate": 7.5032946758039e-07, "logits/chosen": 0.8641601800918579, "logits/rejected": 0.747851550579071, "logps/chosen": -358.1000061035156, "logps/rejected": -351.42498779296875, "loss": 0.7097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4690185487270355, "rewards/margins": 0.840258777141571, "rewards/rejected": -0.372314453125, "step": 3790 }, { "epoch": 1.0015814443858724, "grad_norm": 31.594140930177005, "learning_rate": 7.496705324196099e-07, "logits/chosen": 0.89501953125, "logits/rejected": 0.814453125, "logps/chosen": -340.67498779296875, "logps/rejected": -340.3999938964844, "loss": 0.2091, "rewards/accuracies": 0.9174998998641968, "rewards/chosen": 1.203027367591858, "rewards/margins": 3.0033202171325684, "rewards/rejected": -1.8025391101837158, "step": 3800 }, { "epoch": 1.0042171850289932, "grad_norm": 19.64165047684801, "learning_rate": 7.490115972588296e-07, "logits/chosen": 0.735888659954071, "logits/rejected": 0.5679687261581421, "logps/chosen": -293.67498779296875, "logps/rejected": -349.20001220703125, "loss": 0.0965, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2963378429412842, "rewards/margins": 3.746875047683716, "rewards/rejected": -2.451367139816284, "step": 3810 }, { "epoch": 1.006852925672114, "grad_norm": 28.916504549671163, "learning_rate": 7.483526620980495e-07, "logits/chosen": 0.6106628179550171, "logits/rejected": 0.4971923828125, "logps/chosen": -362.79998779296875, "logps/rejected": -393.6000061035156, "loss": 0.1083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.49365234375, "rewards/margins": 4.324999809265137, "rewards/rejected": -2.831249952316284, "step": 3820 }, { "epoch": 1.0094886663152345, "grad_norm": 14.680237099040212, "learning_rate": 7.476937269372693e-07, "logits/chosen": 0.4355529844760895, "logits/rejected": 0.31585997343063354, "logps/chosen": -399.04998779296875, "logps/rejected": -376.3999938964844, "loss": 0.094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.384374976158142, "rewards/margins": 4.471093654632568, "rewards/rejected": -3.0863280296325684, "step": 3830 }, { "epoch": 1.0121244069583553, "grad_norm": 37.629528815115734, "learning_rate": 7.470347917764892e-07, "logits/chosen": 0.2619796693325043, "logits/rejected": 0.22304686903953552, "logps/chosen": -345.20001220703125, "logps/rejected": -374.20001220703125, "loss": 0.1083, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.7112182378768921, "rewards/margins": 4.365624904632568, "rewards/rejected": -3.6578125953674316, "step": 3840 }, { "epoch": 1.014760147601476, "grad_norm": 31.65899793818623, "learning_rate": 7.46375856615709e-07, "logits/chosen": 0.16011962294578552, "logits/rejected": 0.14202575385570526, "logps/chosen": -374.04998779296875, "logps/rejected": -416.20001220703125, "loss": 0.1254, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5967041254043579, "rewards/margins": 4.817968845367432, "rewards/rejected": -4.220312595367432, "step": 3850 }, { "epoch": 1.0173958882445968, "grad_norm": 36.35744979865505, "learning_rate": 7.457169214549288e-07, "logits/chosen": 0.3855520188808441, "logits/rejected": 0.20976562798023224, "logps/chosen": -402.79998779296875, "logps/rejected": -395.6499938964844, "loss": 0.1126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5539306402206421, "rewards/margins": 4.517187595367432, "rewards/rejected": -3.958984375, "step": 3860 }, { "epoch": 1.0200316288877174, "grad_norm": 16.888530267702762, "learning_rate": 7.450579862941486e-07, "logits/chosen": 0.4399780333042145, "logits/rejected": 0.24610748887062073, "logps/chosen": -374.04998779296875, "logps/rejected": -385.20001220703125, "loss": 0.1062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.83154296875, "rewards/margins": 4.749218940734863, "rewards/rejected": -3.917187452316284, "step": 3870 }, { "epoch": 1.0226673695308381, "grad_norm": 32.2562439622776, "learning_rate": 7.443990511333684e-07, "logits/chosen": 0.39789122343063354, "logits/rejected": 0.33372193574905396, "logps/chosen": -373.1499938964844, "logps/rejected": -364.3999938964844, "loss": 0.0926, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.42822265625, "rewards/margins": 4.169531345367432, "rewards/rejected": -2.744140625, "step": 3880 }, { "epoch": 1.025303110173959, "grad_norm": 22.04272184011361, "learning_rate": 7.437401159725882e-07, "logits/chosen": 0.75, "logits/rejected": 0.6440185308456421, "logps/chosen": -358.6000061035156, "logps/rejected": -383.1000061035156, "loss": 0.1159, "rewards/accuracies": 0.96875, "rewards/chosen": 1.422753930091858, "rewards/margins": 4.299218654632568, "rewards/rejected": -2.876953125, "step": 3890 }, { "epoch": 1.0279388508170797, "grad_norm": 28.405690990311363, "learning_rate": 7.430811808118081e-07, "logits/chosen": 0.5026611089706421, "logits/rejected": 0.4567016661167145, "logps/chosen": -367.0, "logps/rejected": -392.1000061035156, "loss": 0.1134, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2739746570587158, "rewards/margins": 4.364843845367432, "rewards/rejected": -3.0921874046325684, "step": 3900 }, { "epoch": 1.0305745914602003, "grad_norm": 20.35692111736546, "learning_rate": 7.424222456510279e-07, "logits/chosen": 0.5008178949356079, "logits/rejected": 0.45518797636032104, "logps/chosen": -382.5, "logps/rejected": -379.3999938964844, "loss": 0.1073, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.984021008014679, "rewards/margins": 4.40234375, "rewards/rejected": -3.4183592796325684, "step": 3910 }, { "epoch": 1.033210332103321, "grad_norm": 29.291963049762987, "learning_rate": 7.417633104902478e-07, "logits/chosen": 0.2986694276332855, "logits/rejected": 0.2710815370082855, "logps/chosen": -349.95001220703125, "logps/rejected": -358.0, "loss": 0.0887, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.883837878704071, "rewards/margins": 4.560156345367432, "rewards/rejected": -3.6812500953674316, "step": 3920 }, { "epoch": 1.0358460727464418, "grad_norm": 28.394746471400435, "learning_rate": 7.411043753294675e-07, "logits/chosen": 0.3136230409145355, "logits/rejected": 0.27001953125, "logps/chosen": -341.70001220703125, "logps/rejected": -362.8999938964844, "loss": 0.1278, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6605468988418579, "rewards/margins": 4.32421875, "rewards/rejected": -3.6644530296325684, "step": 3930 }, { "epoch": 1.0384818133895624, "grad_norm": 17.895190323283913, "learning_rate": 7.404454401686874e-07, "logits/chosen": 0.33660888671875, "logits/rejected": 0.30914306640625, "logps/chosen": -384.70001220703125, "logps/rejected": -401.75, "loss": 0.1152, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.881335437297821, "rewards/margins": 4.68359375, "rewards/rejected": -3.803906202316284, "step": 3940 }, { "epoch": 1.0411175540326831, "grad_norm": 23.14516989640528, "learning_rate": 7.397865050079072e-07, "logits/chosen": 0.3578735291957855, "logits/rejected": 0.32518309354782104, "logps/chosen": -342.3999938964844, "logps/rejected": -379.20001220703125, "loss": 0.0806, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.151464819908142, "rewards/margins": 4.579687595367432, "rewards/rejected": -3.4320311546325684, "step": 3950 }, { "epoch": 1.043753294675804, "grad_norm": 19.022741107239344, "learning_rate": 7.39127569847127e-07, "logits/chosen": 0.5271056890487671, "logits/rejected": 0.38847047090530396, "logps/chosen": -358.20001220703125, "logps/rejected": -383.70001220703125, "loss": 0.096, "rewards/accuracies": 0.96875, "rewards/chosen": 1.129968285560608, "rewards/margins": 4.582812309265137, "rewards/rejected": -3.452343702316284, "step": 3960 }, { "epoch": 1.0463890353189247, "grad_norm": 16.130608464685846, "learning_rate": 7.384686346863468e-07, "logits/chosen": 0.5523437261581421, "logits/rejected": 0.4036010801792145, "logps/chosen": -394.5, "logps/rejected": -395.20001220703125, "loss": 0.1163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.24725341796875, "rewards/margins": 4.551562309265137, "rewards/rejected": -3.301953077316284, "step": 3970 }, { "epoch": 1.0490247759620452, "grad_norm": 66.34157321042706, "learning_rate": 7.378096995255666e-07, "logits/chosen": 0.35364073514938354, "logits/rejected": 0.3245605528354645, "logps/chosen": -367.75, "logps/rejected": -415.8500061035156, "loss": 0.0958, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0299804210662842, "rewards/margins": 4.992968559265137, "rewards/rejected": -3.9625000953674316, "step": 3980 }, { "epoch": 1.051660516605166, "grad_norm": 15.819389294052003, "learning_rate": 7.371507643647865e-07, "logits/chosen": 0.522412121295929, "logits/rejected": 0.36846923828125, "logps/chosen": -375.8999938964844, "logps/rejected": -354.5, "loss": 0.1055, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.498779296875, "rewards/margins": 4.305468559265137, "rewards/rejected": -3.8070311546325684, "step": 3990 }, { "epoch": 1.0542962572482868, "grad_norm": 36.99411199763621, "learning_rate": 7.364918292040064e-07, "logits/chosen": 0.35883790254592896, "logits/rejected": 0.240234375, "logps/chosen": -396.8500061035156, "logps/rejected": -394.79998779296875, "loss": 0.1155, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.541674792766571, "rewards/margins": 4.5625, "rewards/rejected": -4.021874904632568, "step": 4000 }, { "epoch": 1.0569319978914076, "grad_norm": 28.605141451757365, "learning_rate": 7.358328940432261e-07, "logits/chosen": 0.37199705839157104, "logits/rejected": 0.35638427734375, "logps/chosen": -308.79998779296875, "logps/rejected": -327.8500061035156, "loss": 0.1225, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.43084716796875, "rewards/margins": 4.239843845367432, "rewards/rejected": -3.80859375, "step": 4010 }, { "epoch": 1.0595677385345281, "grad_norm": 38.24149862493702, "learning_rate": 7.35173958882446e-07, "logits/chosen": 0.4095825254917145, "logits/rejected": 0.35224610567092896, "logps/chosen": -405.79998779296875, "logps/rejected": -405.6000061035156, "loss": 0.0902, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9212890863418579, "rewards/margins": 4.822656154632568, "rewards/rejected": -3.899218797683716, "step": 4020 }, { "epoch": 1.062203479177649, "grad_norm": 40.76614571277496, "learning_rate": 7.345150237216657e-07, "logits/chosen": 0.24643555283546448, "logits/rejected": 0.2695556581020355, "logps/chosen": -366.25, "logps/rejected": -428.79998779296875, "loss": 0.0933, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21208496391773224, "rewards/margins": 4.686718940734863, "rewards/rejected": -4.474999904632568, "step": 4030 }, { "epoch": 1.0648392198207697, "grad_norm": 41.73167860978793, "learning_rate": 7.338560885608856e-07, "logits/chosen": 0.13322143256664276, "logits/rejected": 0.0374755859375, "logps/chosen": -387.6000061035156, "logps/rejected": -379.70001220703125, "loss": 0.0798, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.259857177734375, "rewards/margins": 4.582812309265137, "rewards/rejected": -4.321875095367432, "step": 4040 }, { "epoch": 1.0674749604638905, "grad_norm": 68.33578240912249, "learning_rate": 7.331971534001053e-07, "logits/chosen": 0.20774230360984802, "logits/rejected": -0.01596679724752903, "logps/chosen": -356.79998779296875, "logps/rejected": -367.29998779296875, "loss": 0.0992, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22170409560203552, "rewards/margins": 4.861718654632568, "rewards/rejected": -4.635937690734863, "step": 4050 }, { "epoch": 1.070110701107011, "grad_norm": 57.33073226751041, "learning_rate": 7.325382182393253e-07, "logits/chosen": 0.16217955946922302, "logits/rejected": 0.10169067233800888, "logps/chosen": -351.95001220703125, "logps/rejected": -366.8999938964844, "loss": 0.1035, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5057373046875, "rewards/margins": 4.87890625, "rewards/rejected": -4.373437404632568, "step": 4060 }, { "epoch": 1.0727464417501318, "grad_norm": 35.914760350270626, "learning_rate": 7.318792830785451e-07, "logits/chosen": 0.34576416015625, "logits/rejected": 0.20513305068016052, "logps/chosen": -375.54998779296875, "logps/rejected": -345.8500061035156, "loss": 0.097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.068750023841858, "rewards/margins": 4.938281059265137, "rewards/rejected": -3.8695311546325684, "step": 4070 }, { "epoch": 1.0753821823932526, "grad_norm": 16.975847900609804, "learning_rate": 7.312203479177649e-07, "logits/chosen": 0.27680665254592896, "logits/rejected": 0.12322998046875, "logps/chosen": -364.79998779296875, "logps/rejected": -374.45001220703125, "loss": 0.0856, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.7085937261581421, "rewards/margins": 4.807812690734863, "rewards/rejected": -4.103125095367432, "step": 4080 }, { "epoch": 1.0780179230363731, "grad_norm": 21.98429833946139, "learning_rate": 7.305614127569847e-07, "logits/chosen": 0.26628798246383667, "logits/rejected": 0.0997314453125, "logps/chosen": -350.25, "logps/rejected": -354.25, "loss": 0.0749, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5536133050918579, "rewards/margins": 4.595312595367432, "rewards/rejected": -4.038281440734863, "step": 4090 }, { "epoch": 1.080653663679494, "grad_norm": 17.315618754411034, "learning_rate": 7.299024775962044e-07, "logits/chosen": 0.05018310621380806, "logits/rejected": 0.0036651610862463713, "logps/chosen": -364.3500061035156, "logps/rejected": -384.8999938964844, "loss": 0.091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5407348871231079, "rewards/margins": 5.378125190734863, "rewards/rejected": -4.839453220367432, "step": 4100 }, { "epoch": 1.0832894043226147, "grad_norm": 10.402245855924118, "learning_rate": 7.292435424354243e-07, "logits/chosen": 0.15143433213233948, "logits/rejected": -0.06084594875574112, "logps/chosen": -417.1499938964844, "logps/rejected": -388.8999938964844, "loss": 0.0934, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4214721620082855, "rewards/margins": 5.236718654632568, "rewards/rejected": -4.814843654632568, "step": 4110 }, { "epoch": 1.0859251449657354, "grad_norm": 102.67272486712979, "learning_rate": 7.285846072746441e-07, "logits/chosen": 0.286520391702652, "logits/rejected": 0.13801880180835724, "logps/chosen": -354.8500061035156, "logps/rejected": -381.1000061035156, "loss": 0.1195, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.02143554762005806, "rewards/margins": 4.615624904632568, "rewards/rejected": -4.632031440734863, "step": 4120 }, { "epoch": 1.088560885608856, "grad_norm": 14.746115377978438, "learning_rate": 7.279256721138639e-07, "logits/chosen": 0.01683959923684597, "logits/rejected": -0.10173340141773224, "logps/chosen": -379.3999938964844, "logps/rejected": -387.54998779296875, "loss": 0.1122, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.03458251804113388, "rewards/margins": 4.827343940734863, "rewards/rejected": -4.795312404632568, "step": 4130 }, { "epoch": 1.0911966262519768, "grad_norm": 47.3646804844911, "learning_rate": 7.272667369530838e-07, "logits/chosen": 0.18707275390625, "logits/rejected": 0.04912109300494194, "logps/chosen": -350.6000061035156, "logps/rejected": -373.04998779296875, "loss": 0.1109, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.28267210721969604, "rewards/margins": 4.949999809265137, "rewards/rejected": -4.664843559265137, "step": 4140 }, { "epoch": 1.0938323668950976, "grad_norm": 17.49198046249453, "learning_rate": 7.266078017923036e-07, "logits/chosen": 0.19698485732078552, "logits/rejected": 0.16043701767921448, "logps/chosen": -343.25, "logps/rejected": -336.20001220703125, "loss": 0.107, "rewards/accuracies": 0.96875, "rewards/chosen": 0.863964855670929, "rewards/margins": 4.397656440734863, "rewards/rejected": -3.53515625, "step": 4150 }, { "epoch": 1.0964681075382183, "grad_norm": 18.03753766755064, "learning_rate": 7.259488666315234e-07, "logits/chosen": 0.3450378477573395, "logits/rejected": 0.33392333984375, "logps/chosen": -322.6499938964844, "logps/rejected": -306.3500061035156, "loss": 0.0941, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.195556640625, "rewards/margins": 4.771874904632568, "rewards/rejected": -3.576171875, "step": 4160 }, { "epoch": 1.0991038481813389, "grad_norm": 38.81026806587176, "learning_rate": 7.252899314707433e-07, "logits/chosen": 0.41346436738967896, "logits/rejected": 0.32574462890625, "logps/chosen": -339.54998779296875, "logps/rejected": -357.3999938964844, "loss": 0.1187, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1484375, "rewards/margins": 4.733593940734863, "rewards/rejected": -3.581249952316284, "step": 4170 }, { "epoch": 1.1017395888244597, "grad_norm": 21.99464057399354, "learning_rate": 7.24630996309963e-07, "logits/chosen": 0.38641357421875, "logits/rejected": 0.19942016899585724, "logps/chosen": -366.1000061035156, "logps/rejected": -414.6000061035156, "loss": 0.0687, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.284570336341858, "rewards/margins": 4.997656345367432, "rewards/rejected": -3.709765672683716, "step": 4180 }, { "epoch": 1.1043753294675804, "grad_norm": 17.51748250794626, "learning_rate": 7.239720611491829e-07, "logits/chosen": 0.33956223726272583, "logits/rejected": 0.25080567598342896, "logps/chosen": -339.8500061035156, "logps/rejected": -363.5, "loss": 0.0802, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8569091558456421, "rewards/margins": 4.711718559265137, "rewards/rejected": -3.8539061546325684, "step": 4190 }, { "epoch": 1.1070110701107012, "grad_norm": 19.417518709601143, "learning_rate": 7.233131259884026e-07, "logits/chosen": 0.2588134706020355, "logits/rejected": 0.09993896633386612, "logps/chosen": -381.3500061035156, "logps/rejected": -357.8999938964844, "loss": 0.1008, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8552185297012329, "rewards/margins": 5.083593845367432, "rewards/rejected": -4.227343559265137, "step": 4200 }, { "epoch": 1.1096468107538218, "grad_norm": 68.0193136838801, "learning_rate": 7.226541908276226e-07, "logits/chosen": 0.20946045219898224, "logits/rejected": 0.25239866971969604, "logps/chosen": -376.0, "logps/rejected": -375.1000061035156, "loss": 0.0952, "rewards/accuracies": 0.96875, "rewards/chosen": 1.164892554283142, "rewards/margins": 4.799218654632568, "rewards/rejected": -3.6328125, "step": 4210 }, { "epoch": 1.1122825513969425, "grad_norm": 14.392960104469383, "learning_rate": 7.219952556668424e-07, "logits/chosen": 0.19267578423023224, "logits/rejected": 0.07356567680835724, "logps/chosen": -359.5, "logps/rejected": -366.29998779296875, "loss": 0.1106, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6741088628768921, "rewards/margins": 4.400781154632568, "rewards/rejected": -3.7249999046325684, "step": 4220 }, { "epoch": 1.1149182920400633, "grad_norm": 52.62758461245308, "learning_rate": 7.213363205060622e-07, "logits/chosen": 0.24589844048023224, "logits/rejected": 0.17322082817554474, "logps/chosen": -337.70001220703125, "logps/rejected": -350.6000061035156, "loss": 0.1549, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.620288074016571, "rewards/margins": 4.146093845367432, "rewards/rejected": -3.52734375, "step": 4230 }, { "epoch": 1.1175540326831839, "grad_norm": 23.41395408299418, "learning_rate": 7.20677385345282e-07, "logits/chosen": 0.25812989473342896, "logits/rejected": 0.07203368842601776, "logps/chosen": -401.29998779296875, "logps/rejected": -388.3500061035156, "loss": 0.0883, "rewards/accuracies": 0.96875, "rewards/chosen": 1.138671875, "rewards/margins": 4.608593940734863, "rewards/rejected": -3.473437547683716, "step": 4240 }, { "epoch": 1.1201897733263046, "grad_norm": 41.540408267232664, "learning_rate": 7.200184501845018e-07, "logits/chosen": 0.18328857421875, "logits/rejected": 0.087158203125, "logps/chosen": -359.20001220703125, "logps/rejected": -378.20001220703125, "loss": 0.0965, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.85491943359375, "rewards/margins": 4.65234375, "rewards/rejected": -3.80078125, "step": 4250 }, { "epoch": 1.1228255139694254, "grad_norm": 30.907445162260565, "learning_rate": 7.193595150237216e-07, "logits/chosen": 0.285369873046875, "logits/rejected": 0.10084228217601776, "logps/chosen": -426.6000061035156, "logps/rejected": -399.3999938964844, "loss": 0.1049, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.502368152141571, "rewards/margins": 4.839062690734863, "rewards/rejected": -4.337500095367432, "step": 4260 }, { "epoch": 1.1254612546125462, "grad_norm": 25.43515131145459, "learning_rate": 7.187005798629414e-07, "logits/chosen": 0.4171142578125, "logits/rejected": 0.14002685248851776, "logps/chosen": -371.8500061035156, "logps/rejected": -418.70001220703125, "loss": 0.1225, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.6870361566543579, "rewards/margins": 4.971093654632568, "rewards/rejected": -4.285937309265137, "step": 4270 }, { "epoch": 1.1280969952556668, "grad_norm": 35.18711710083787, "learning_rate": 7.180416447021612e-07, "logits/chosen": 0.05230712890625, "logits/rejected": 0.0684661865234375, "logps/chosen": -367.3999938964844, "logps/rejected": -421.1499938964844, "loss": 0.0919, "rewards/accuracies": 0.96875, "rewards/chosen": 0.615527331829071, "rewards/margins": 5.33984375, "rewards/rejected": -4.723437309265137, "step": 4280 }, { "epoch": 1.1307327358987875, "grad_norm": 69.8963847098789, "learning_rate": 7.173827095413812e-07, "logits/chosen": 0.3185791075229645, "logits/rejected": 0.2403564453125, "logps/chosen": -392.1499938964844, "logps/rejected": -400.70001220703125, "loss": 0.1294, "rewards/accuracies": 0.9375, "rewards/chosen": 0.433837890625, "rewards/margins": 4.712500095367432, "rewards/rejected": -4.278906345367432, "step": 4290 }, { "epoch": 1.1333684765419083, "grad_norm": 22.651966647361842, "learning_rate": 7.167237743806009e-07, "logits/chosen": 0.23542480170726776, "logits/rejected": 0.05382080003619194, "logps/chosen": -382.54998779296875, "logps/rejected": -376.6000061035156, "loss": 0.0881, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8003631830215454, "rewards/margins": 4.657812595367432, "rewards/rejected": -3.856250047683716, "step": 4300 }, { "epoch": 1.136004217185029, "grad_norm": 26.8473305211961, "learning_rate": 7.160648392198208e-07, "logits/chosen": 0.23316650092601776, "logits/rejected": 0.21799926459789276, "logps/chosen": -387.95001220703125, "logps/rejected": -437.79998779296875, "loss": 0.0806, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.730603039264679, "rewards/margins": 5.385156154632568, "rewards/rejected": -4.657031059265137, "step": 4310 }, { "epoch": 1.1386399578281496, "grad_norm": 44.9731554235401, "learning_rate": 7.154059040590405e-07, "logits/chosen": 0.13201904296875, "logits/rejected": -0.09998778998851776, "logps/chosen": -409.04998779296875, "logps/rejected": -432.3999938964844, "loss": 0.0926, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.20847778022289276, "rewards/margins": 5.236718654632568, "rewards/rejected": -5.0234375, "step": 4320 }, { "epoch": 1.1412756984712704, "grad_norm": 20.267106860094067, "learning_rate": 7.147469688982604e-07, "logits/chosen": 0.21224670112133026, "logits/rejected": 0.0477294921875, "logps/chosen": -379.95001220703125, "logps/rejected": -380.04998779296875, "loss": 0.1003, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.40068358182907104, "rewards/margins": 5.133593559265137, "rewards/rejected": -4.736718654632568, "step": 4330 }, { "epoch": 1.1439114391143912, "grad_norm": 39.351576208938674, "learning_rate": 7.140880337374802e-07, "logits/chosen": 0.2520385682582855, "logits/rejected": 0.3315063416957855, "logps/chosen": -381.3500061035156, "logps/rejected": -433.70001220703125, "loss": 0.0919, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.687853991985321, "rewards/margins": 5.180468559265137, "rewards/rejected": -4.4921875, "step": 4340 }, { "epoch": 1.146547179757512, "grad_norm": 41.28909323058041, "learning_rate": 7.134290985767e-07, "logits/chosen": 0.3379760682582855, "logits/rejected": 0.08563842624425888, "logps/chosen": -341.04998779296875, "logps/rejected": -345.6499938964844, "loss": 0.1312, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8504394292831421, "rewards/margins": 4.521093845367432, "rewards/rejected": -3.672656297683716, "step": 4350 }, { "epoch": 1.1491829204006325, "grad_norm": 18.216635542010025, "learning_rate": 7.127701634159199e-07, "logits/chosen": 0.225494384765625, "logits/rejected": 0.09736327826976776, "logps/chosen": -370.6000061035156, "logps/rejected": -389.25, "loss": 0.0865, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.9610351324081421, "rewards/margins": 5.196093559265137, "rewards/rejected": -4.235547065734863, "step": 4360 }, { "epoch": 1.1518186610437533, "grad_norm": 17.908395608628094, "learning_rate": 7.121112282551397e-07, "logits/chosen": 0.20873717963695526, "logits/rejected": 0.05540161207318306, "logps/chosen": -388.1499938964844, "logps/rejected": -404.0, "loss": 0.0854, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8751220703125, "rewards/margins": 4.671875, "rewards/rejected": -3.797656297683716, "step": 4370 }, { "epoch": 1.154454401686874, "grad_norm": 43.5479439229367, "learning_rate": 7.114522930943595e-07, "logits/chosen": 0.3413452208042145, "logits/rejected": 0.16683349013328552, "logps/chosen": -373.79998779296875, "logps/rejected": -361.1000061035156, "loss": 0.0997, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9920898675918579, "rewards/margins": 4.787499904632568, "rewards/rejected": -3.799999952316284, "step": 4380 }, { "epoch": 1.1570901423299946, "grad_norm": 17.2214436339624, "learning_rate": 7.107933579335793e-07, "logits/chosen": 0.26837158203125, "logits/rejected": 0.12669678032398224, "logps/chosen": -382.75, "logps/rejected": -361.54998779296875, "loss": 0.1068, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.633593738079071, "rewards/margins": 4.809374809265137, "rewards/rejected": -4.174218654632568, "step": 4390 }, { "epoch": 1.1597258829731154, "grad_norm": 15.123055037652136, "learning_rate": 7.101344227727991e-07, "logits/chosen": 0.383544921875, "logits/rejected": 0.14621277153491974, "logps/chosen": -407.3999938964844, "logps/rejected": -390.3500061035156, "loss": 0.0962, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.023584008216858, "rewards/margins": 4.654687404632568, "rewards/rejected": -3.633593797683716, "step": 4400 }, { "epoch": 1.1623616236162362, "grad_norm": 10.53902554914139, "learning_rate": 7.09475487612019e-07, "logits/chosen": 0.33782958984375, "logits/rejected": 0.01571044884622097, "logps/chosen": -384.1499938964844, "logps/rejected": -371.75, "loss": 0.1151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.610424816608429, "rewards/margins": 4.622656345367432, "rewards/rejected": -4.010156154632568, "step": 4410 }, { "epoch": 1.164997364259357, "grad_norm": 29.690437118612177, "learning_rate": 7.088165524512387e-07, "logits/chosen": 0.25504761934280396, "logits/rejected": 0.20732422173023224, "logps/chosen": -318.20001220703125, "logps/rejected": -356.04998779296875, "loss": 0.1004, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.737597644329071, "rewards/margins": 4.5390625, "rewards/rejected": -3.8046875, "step": 4420 }, { "epoch": 1.1676331049024775, "grad_norm": 45.473785607928384, "learning_rate": 7.081576172904587e-07, "logits/chosen": 0.268087774515152, "logits/rejected": 0.19773559272289276, "logps/chosen": -351.0, "logps/rejected": -356.1499938964844, "loss": 0.1066, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9067138433456421, "rewards/margins": 4.71484375, "rewards/rejected": -3.8070311546325684, "step": 4430 }, { "epoch": 1.1702688455455983, "grad_norm": 42.904215375525254, "learning_rate": 7.074986821296784e-07, "logits/chosen": 0.27460938692092896, "logits/rejected": 0.19037170708179474, "logps/chosen": -387.1000061035156, "logps/rejected": -392.20001220703125, "loss": 0.1076, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.116796851158142, "rewards/margins": 4.862500190734863, "rewards/rejected": -3.746875047683716, "step": 4440 }, { "epoch": 1.172904586188719, "grad_norm": 29.033350593556644, "learning_rate": 7.068397469688983e-07, "logits/chosen": 0.22431640326976776, "logits/rejected": 0.013140869326889515, "logps/chosen": -366.95001220703125, "logps/rejected": -380.8500061035156, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 1.0477173328399658, "rewards/margins": 4.903906345367432, "rewards/rejected": -3.8539061546325684, "step": 4450 }, { "epoch": 1.1755403268318398, "grad_norm": 21.51893964004847, "learning_rate": 7.061808118081181e-07, "logits/chosen": 0.361907958984375, "logits/rejected": 0.09852294623851776, "logps/chosen": -367.95001220703125, "logps/rejected": -362.5, "loss": 0.1351, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.02099609375, "rewards/margins": 4.470312595367432, "rewards/rejected": -3.453125, "step": 4460 }, { "epoch": 1.1781760674749604, "grad_norm": 59.61434587609833, "learning_rate": 7.055218766473378e-07, "logits/chosen": 0.1920166015625, "logits/rejected": 0.16575928032398224, "logps/chosen": -349.25, "logps/rejected": -359.1499938964844, "loss": 0.1078, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3309326171875, "rewards/margins": 4.525781154632568, "rewards/rejected": -3.192187547683716, "step": 4470 }, { "epoch": 1.1808118081180812, "grad_norm": 46.07837986071755, "learning_rate": 7.048629414865577e-07, "logits/chosen": 0.2814987301826477, "logits/rejected": 0.17572021484375, "logps/chosen": -350.1000061035156, "logps/rejected": -389.3500061035156, "loss": 0.083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.740625023841858, "rewards/margins": 5.110937595367432, "rewards/rejected": -3.3746094703674316, "step": 4480 }, { "epoch": 1.183447548761202, "grad_norm": 13.46186301260293, "learning_rate": 7.042040063257774e-07, "logits/chosen": 0.21455688774585724, "logits/rejected": 0.08641967922449112, "logps/chosen": -336.6000061035156, "logps/rejected": -365.3500061035156, "loss": 0.0622, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.331445336341858, "rewards/margins": 4.931250095367432, "rewards/rejected": -3.5992188453674316, "step": 4490 }, { "epoch": 1.1860832894043227, "grad_norm": 54.83226253519802, "learning_rate": 7.035450711649973e-07, "logits/chosen": 0.03768310695886612, "logits/rejected": -0.1771240234375, "logps/chosen": -359.70001220703125, "logps/rejected": -371.29998779296875, "loss": 0.0762, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.88037109375, "rewards/margins": 4.9375, "rewards/rejected": -4.057031154632568, "step": 4500 }, { "epoch": 1.1887190300474433, "grad_norm": 37.36199347702336, "learning_rate": 7.028861360042172e-07, "logits/chosen": -0.15245971083641052, "logits/rejected": -0.19435425102710724, "logps/chosen": -335.45001220703125, "logps/rejected": -373.79998779296875, "loss": 0.0958, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.45488280057907104, "rewards/margins": 5.342968940734863, "rewards/rejected": -4.88671875, "step": 4510 }, { "epoch": 1.191354770690564, "grad_norm": 12.949895171075141, "learning_rate": 7.02227200843437e-07, "logits/chosen": 0.13163451850414276, "logits/rejected": -0.15529175102710724, "logps/chosen": -334.70001220703125, "logps/rejected": -373.5, "loss": 0.0879, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4611144959926605, "rewards/margins": 4.944531440734863, "rewards/rejected": -4.483593940734863, "step": 4520 }, { "epoch": 1.1939905113336848, "grad_norm": 15.24082825433002, "learning_rate": 7.015682656826568e-07, "logits/chosen": 0.03682861477136612, "logits/rejected": -0.02395019493997097, "logps/chosen": -315.29998779296875, "logps/rejected": -376.8999938964844, "loss": 0.1216, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.100341796875, "rewards/margins": 4.984375, "rewards/rejected": -4.88671875, "step": 4530 }, { "epoch": 1.1966262519768054, "grad_norm": 24.52172120378408, "learning_rate": 7.009093305218766e-07, "logits/chosen": -0.02985839918255806, "logits/rejected": -0.098388671875, "logps/chosen": -326.6499938964844, "logps/rejected": -366.70001220703125, "loss": 0.1055, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10009765625, "rewards/margins": 4.838281154632568, "rewards/rejected": -4.736718654632568, "step": 4540 }, { "epoch": 1.1992619926199262, "grad_norm": 34.3550260040994, "learning_rate": 7.002503953610964e-07, "logits/chosen": 0.15020751953125, "logits/rejected": -0.024505615234375, "logps/chosen": -322.1000061035156, "logps/rejected": -365.25, "loss": 0.0898, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.694775402545929, "rewards/margins": 4.799218654632568, "rewards/rejected": -4.102343559265137, "step": 4550 }, { "epoch": 1.201897733263047, "grad_norm": 31.27165242003128, "learning_rate": 6.995914602003162e-07, "logits/chosen": 0.044403076171875, "logits/rejected": -0.12076415866613388, "logps/chosen": -398.75, "logps/rejected": -417.79998779296875, "loss": 0.0679, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8260742425918579, "rewards/margins": 5.345312595367432, "rewards/rejected": -4.517187595367432, "step": 4560 }, { "epoch": 1.2045334739061677, "grad_norm": 14.578635226792269, "learning_rate": 6.98932525039536e-07, "logits/chosen": -0.02597656287252903, "logits/rejected": -0.21768799424171448, "logps/chosen": -352.6000061035156, "logps/rejected": -391.45001220703125, "loss": 0.0825, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5383850336074829, "rewards/margins": 5.324999809265137, "rewards/rejected": -4.788281440734863, "step": 4570 }, { "epoch": 1.2071692145492883, "grad_norm": 21.546692282230897, "learning_rate": 6.98273589878756e-07, "logits/chosen": -0.0091552734375, "logits/rejected": -0.08544921875, "logps/chosen": -378.70001220703125, "logps/rejected": -401.5, "loss": 0.1222, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.592041015625, "rewards/margins": 5.1796875, "rewards/rejected": -4.587500095367432, "step": 4580 }, { "epoch": 1.209804955192409, "grad_norm": 73.37275991563487, "learning_rate": 6.976146547179757e-07, "logits/chosen": 0.05423583835363388, "logits/rejected": -0.01312255859375, "logps/chosen": -335.0, "logps/rejected": -366.70001220703125, "loss": 0.095, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7913818359375, "rewards/margins": 4.333593845367432, "rewards/rejected": -3.543750047683716, "step": 4590 }, { "epoch": 1.2124406958355298, "grad_norm": 61.425047489406516, "learning_rate": 6.969557195571956e-07, "logits/chosen": 0.13197021186351776, "logits/rejected": -0.10808105766773224, "logps/chosen": -381.70001220703125, "logps/rejected": -351.20001220703125, "loss": 0.0954, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8915771245956421, "rewards/margins": 4.733593940734863, "rewards/rejected": -3.83984375, "step": 4600 }, { "epoch": 1.2150764364786504, "grad_norm": 34.36656006410676, "learning_rate": 6.962967843964153e-07, "logits/chosen": 0.12956543266773224, "logits/rejected": -0.007800293155014515, "logps/chosen": -371.29998779296875, "logps/rejected": -393.79998779296875, "loss": 0.1296, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8606933355331421, "rewards/margins": 5.014062404632568, "rewards/rejected": -4.153906345367432, "step": 4610 }, { "epoch": 1.2177121771217712, "grad_norm": 30.847586804683946, "learning_rate": 6.956378492356352e-07, "logits/chosen": 0.14089354872703552, "logits/rejected": -0.08770751953125, "logps/chosen": -400.3999938964844, "logps/rejected": -403.25, "loss": 0.078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.46601563692092896, "rewards/margins": 5.497656345367432, "rewards/rejected": -5.02734375, "step": 4620 }, { "epoch": 1.220347917764892, "grad_norm": 30.67616279706453, "learning_rate": 6.94978914074855e-07, "logits/chosen": -0.004833984188735485, "logits/rejected": -0.1087646484375, "logps/chosen": -333.6499938964844, "logps/rejected": -355.20001220703125, "loss": 0.1108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5796874761581421, "rewards/margins": 4.879687309265137, "rewards/rejected": -4.302343845367432, "step": 4630 }, { "epoch": 1.2229836584080127, "grad_norm": 47.74831618764026, "learning_rate": 6.943199789140748e-07, "logits/chosen": 0.1544189453125, "logits/rejected": 0.05771484225988388, "logps/chosen": -366.79998779296875, "logps/rejected": -364.25, "loss": 0.08, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.725634753704071, "rewards/margins": 4.905468940734863, "rewards/rejected": -4.178906440734863, "step": 4640 }, { "epoch": 1.2256193990511335, "grad_norm": 53.42941987260713, "learning_rate": 6.936610437532946e-07, "logits/chosen": 0.3246215879917145, "logits/rejected": 0.3170410096645355, "logps/chosen": -355.5, "logps/rejected": -365.0, "loss": 0.1247, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.846264660358429, "rewards/margins": 4.617968559265137, "rewards/rejected": -3.76953125, "step": 4650 }, { "epoch": 1.228255139694254, "grad_norm": 8.555798317315567, "learning_rate": 6.930021085925145e-07, "logits/chosen": 0.2635253965854645, "logits/rejected": 0.08320923149585724, "logps/chosen": -352.6499938964844, "logps/rejected": -363.20001220703125, "loss": 0.1, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.879101574420929, "rewards/margins": 5.147656440734863, "rewards/rejected": -4.272656440734863, "step": 4660 }, { "epoch": 1.2308908803373748, "grad_norm": 39.12662357360214, "learning_rate": 6.923431734317343e-07, "logits/chosen": 0.18761596083641052, "logits/rejected": 0.01954803429543972, "logps/chosen": -334.8500061035156, "logps/rejected": -342.8999938964844, "loss": 0.1172, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.986621081829071, "rewards/margins": 4.7109375, "rewards/rejected": -3.7242188453674316, "step": 4670 }, { "epoch": 1.2335266209804956, "grad_norm": 14.28724752260881, "learning_rate": 6.916842382709542e-07, "logits/chosen": 0.18305663764476776, "logits/rejected": 0.12093429267406464, "logps/chosen": -412.6000061035156, "logps/rejected": -464.29998779296875, "loss": 0.0559, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.9693847894668579, "rewards/margins": 5.473437309265137, "rewards/rejected": -4.50390625, "step": 4680 }, { "epoch": 1.2361623616236161, "grad_norm": 30.54233191880376, "learning_rate": 6.910253031101739e-07, "logits/chosen": 0.114044189453125, "logits/rejected": 0.04034423828125, "logps/chosen": -352.6000061035156, "logps/rejected": -375.04998779296875, "loss": 0.1357, "rewards/accuracies": 0.9375, "rewards/chosen": 0.708300769329071, "rewards/margins": 5.087500095367432, "rewards/rejected": -4.380468845367432, "step": 4690 }, { "epoch": 1.238798102266737, "grad_norm": 5.794751442224136, "learning_rate": 6.903663679493938e-07, "logits/chosen": 0.24383544921875, "logits/rejected": 0.04224243015050888, "logps/chosen": -380.1000061035156, "logps/rejected": -384.54998779296875, "loss": 0.0939, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.646148681640625, "rewards/margins": 5.17578125, "rewards/rejected": -4.532812595367432, "step": 4700 }, { "epoch": 1.2414338429098577, "grad_norm": 52.283336277327706, "learning_rate": 6.897074327886135e-07, "logits/chosen": 0.04935302585363388, "logits/rejected": 0.02467041090130806, "logps/chosen": -323.82501220703125, "logps/rejected": -357.3500061035156, "loss": 0.1576, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.528210461139679, "rewards/margins": 4.904687404632568, "rewards/rejected": -4.375781059265137, "step": 4710 }, { "epoch": 1.2440695835529785, "grad_norm": 21.17001200478512, "learning_rate": 6.890484976278334e-07, "logits/chosen": 0.21837158501148224, "logits/rejected": 0.09243164211511612, "logps/chosen": -341.3999938964844, "logps/rejected": -381.5, "loss": 0.0652, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.109686255455017, "rewards/margins": 4.872656345367432, "rewards/rejected": -3.758593797683716, "step": 4720 }, { "epoch": 1.246705324196099, "grad_norm": 91.00751935286566, "learning_rate": 6.883895624670533e-07, "logits/chosen": 0.14035645127296448, "logits/rejected": 0.09340210258960724, "logps/chosen": -427.79998779296875, "logps/rejected": -418.25, "loss": 0.0966, "rewards/accuracies": 0.96875, "rewards/chosen": 0.523205578327179, "rewards/margins": 5.504687309265137, "rewards/rejected": -4.978125095367432, "step": 4730 }, { "epoch": 1.2493410648392198, "grad_norm": 24.325166304360394, "learning_rate": 6.877306273062731e-07, "logits/chosen": 0.18385925889015198, "logits/rejected": -0.01351318322122097, "logps/chosen": -394.45001220703125, "logps/rejected": -379.70001220703125, "loss": 0.1066, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8114258050918579, "rewards/margins": 5.092968940734863, "rewards/rejected": -4.282812595367432, "step": 4740 }, { "epoch": 1.2519768054823406, "grad_norm": 36.959321950980566, "learning_rate": 6.870716921454929e-07, "logits/chosen": 0.3134750425815582, "logits/rejected": 0.206756591796875, "logps/chosen": -342.29998779296875, "logps/rejected": -372.5, "loss": 0.0871, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8031860589981079, "rewards/margins": 4.705468654632568, "rewards/rejected": -3.905468702316284, "step": 4750 }, { "epoch": 1.2546125461254611, "grad_norm": 11.517226254032863, "learning_rate": 6.864127569847127e-07, "logits/chosen": 0.3131347596645355, "logits/rejected": 0.22187499701976776, "logps/chosen": -362.0, "logps/rejected": -392.70001220703125, "loss": 0.0758, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.126562476158142, "rewards/margins": 5.176562309265137, "rewards/rejected": -4.046093940734863, "step": 4760 }, { "epoch": 1.257248286768582, "grad_norm": 34.41951438805259, "learning_rate": 6.857538218239325e-07, "logits/chosen": 0.11249389499425888, "logits/rejected": 0.02458648756146431, "logps/chosen": -327.75, "logps/rejected": -354.5, "loss": 0.0877, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1461913585662842, "rewards/margins": 4.961718559265137, "rewards/rejected": -3.8128905296325684, "step": 4770 }, { "epoch": 1.2598840274117027, "grad_norm": 14.983403313783088, "learning_rate": 6.850948866631523e-07, "logits/chosen": 0.12849731743335724, "logits/rejected": -0.0098876953125, "logps/chosen": -314.0, "logps/rejected": -359.25, "loss": 0.0979, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5853271484375, "rewards/margins": 4.833593845367432, "rewards/rejected": -4.243750095367432, "step": 4780 }, { "epoch": 1.2625197680548235, "grad_norm": 25.169757155815542, "learning_rate": 6.844359515023721e-07, "logits/chosen": -0.01584167405962944, "logits/rejected": 0.04005737230181694, "logps/chosen": -364.04998779296875, "logps/rejected": -406.5, "loss": 0.0964, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8499511480331421, "rewards/margins": 4.964062690734863, "rewards/rejected": -4.115624904632568, "step": 4790 }, { "epoch": 1.2651555086979442, "grad_norm": 59.04128382841593, "learning_rate": 6.83777016341592e-07, "logits/chosen": 0.14741210639476776, "logits/rejected": 0.08138427883386612, "logps/chosen": -377.54998779296875, "logps/rejected": -390.95001220703125, "loss": 0.1046, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.9654785394668579, "rewards/margins": 5.216406345367432, "rewards/rejected": -4.246874809265137, "step": 4800 }, { "epoch": 1.2677912493410648, "grad_norm": 15.930121296997916, "learning_rate": 6.831180811808118e-07, "logits/chosen": 0.22663573920726776, "logits/rejected": -0.08249511569738388, "logps/chosen": -399.6000061035156, "logps/rejected": -407.8500061035156, "loss": 0.0542, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.096704125404358, "rewards/margins": 5.557812690734863, "rewards/rejected": -4.46875, "step": 4810 }, { "epoch": 1.2704269899841856, "grad_norm": 36.60262455269198, "learning_rate": 6.824591460200317e-07, "logits/chosen": 0.07129821926355362, "logits/rejected": -0.02918701246380806, "logps/chosen": -405.79998779296875, "logps/rejected": -407.70001220703125, "loss": 0.0893, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9641357660293579, "rewards/margins": 5.255468845367432, "rewards/rejected": -4.295312404632568, "step": 4820 }, { "epoch": 1.2730627306273063, "grad_norm": 31.233254476310186, "learning_rate": 6.818002108592514e-07, "logits/chosen": 0.16089019179344177, "logits/rejected": -0.04051513597369194, "logps/chosen": -377.29998779296875, "logps/rejected": -385.6000061035156, "loss": 0.1304, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.557568371295929, "rewards/margins": 4.841406345367432, "rewards/rejected": -4.2890625, "step": 4830 }, { "epoch": 1.275698471270427, "grad_norm": 32.189662079170844, "learning_rate": 6.811412756984712e-07, "logits/chosen": 0.03913574293255806, "logits/rejected": -0.07808838039636612, "logps/chosen": -409.25, "logps/rejected": -409.45001220703125, "loss": 0.0725, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24736328423023224, "rewards/margins": 5.063281059265137, "rewards/rejected": -4.814062595367432, "step": 4840 }, { "epoch": 1.2783342119135477, "grad_norm": 49.0940828919231, "learning_rate": 6.804823405376911e-07, "logits/chosen": 0.11423339694738388, "logits/rejected": -0.15858764946460724, "logps/chosen": -395.3999938964844, "logps/rejected": -397.1499938964844, "loss": 0.0777, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2861084043979645, "rewards/margins": 5.787499904632568, "rewards/rejected": -5.50390625, "step": 4850 }, { "epoch": 1.2809699525566685, "grad_norm": 62.339195922410646, "learning_rate": 6.798234053769108e-07, "logits/chosen": -0.17901611328125, "logits/rejected": -0.1179962158203125, "logps/chosen": -372.29998779296875, "logps/rejected": -415.8999938964844, "loss": 0.0941, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6331787109375, "rewards/margins": 5.124218940734863, "rewards/rejected": -5.75, "step": 4860 }, { "epoch": 1.2836056931997892, "grad_norm": 27.546332881843302, "learning_rate": 6.791644702161307e-07, "logits/chosen": 0.06462402641773224, "logits/rejected": -0.10135497897863388, "logps/chosen": -367.20001220703125, "logps/rejected": -395.54998779296875, "loss": 0.0865, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.1910400390625, "rewards/margins": 5.157031059265137, "rewards/rejected": -5.34765625, "step": 4870 }, { "epoch": 1.2862414338429098, "grad_norm": 6.465864878114875, "learning_rate": 6.785055350553505e-07, "logits/chosen": 0.08305053412914276, "logits/rejected": -0.06292114406824112, "logps/chosen": -337.6499938964844, "logps/rejected": -409.0, "loss": 0.1331, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.02607421949505806, "rewards/margins": 5.190625190734863, "rewards/rejected": -5.162499904632568, "step": 4880 }, { "epoch": 1.2888771744860306, "grad_norm": 18.356687762447187, "learning_rate": 6.778465998945704e-07, "logits/chosen": 0.19068603217601776, "logits/rejected": 0.06667480617761612, "logps/chosen": -374.67498779296875, "logps/rejected": -431.5, "loss": 0.0738, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.41259765625, "rewards/margins": 5.733593940734863, "rewards/rejected": -5.322656154632568, "step": 4890 }, { "epoch": 1.2915129151291513, "grad_norm": 21.898746658529785, "learning_rate": 6.771876647337901e-07, "logits/chosen": 0.03434448316693306, "logits/rejected": -0.03725586086511612, "logps/chosen": -358.95001220703125, "logps/rejected": -415.70001220703125, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 0.5296996831893921, "rewards/margins": 5.561718940734863, "rewards/rejected": -5.032812595367432, "step": 4900 }, { "epoch": 1.294148655772272, "grad_norm": 40.91416019539498, "learning_rate": 6.7652872957301e-07, "logits/chosen": 0.08124389499425888, "logits/rejected": -0.0113372802734375, "logps/chosen": -367.6499938964844, "logps/rejected": -417.8999938964844, "loss": 0.1097, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.022363305091858, "rewards/margins": 5.520312309265137, "rewards/rejected": -4.501562595367432, "step": 4910 }, { "epoch": 1.2967843964153927, "grad_norm": 75.57505261696649, "learning_rate": 6.758697944122298e-07, "logits/chosen": 0.16577759385108948, "logits/rejected": 0.06364746391773224, "logps/chosen": -362.79998779296875, "logps/rejected": -372.25, "loss": 0.1126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7887207269668579, "rewards/margins": 5.382031440734863, "rewards/rejected": -4.592968940734863, "step": 4920 }, { "epoch": 1.2994201370585134, "grad_norm": 78.27061788120784, "learning_rate": 6.752108592514496e-07, "logits/chosen": 0.09755859524011612, "logits/rejected": 0.01190185546875, "logps/chosen": -414.5, "logps/rejected": -394.29998779296875, "loss": 0.1354, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.4504028260707855, "rewards/margins": 5.039843559265137, "rewards/rejected": -4.589062690734863, "step": 4930 }, { "epoch": 1.3020558777016342, "grad_norm": 22.798086337781456, "learning_rate": 6.745519240906694e-07, "logits/chosen": 0.25273436307907104, "logits/rejected": 0.024658203125, "logps/chosen": -331.5, "logps/rejected": -355.25, "loss": 0.1013, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9371582269668579, "rewards/margins": 4.440625190734863, "rewards/rejected": -3.503124952316284, "step": 4940 }, { "epoch": 1.304691618344755, "grad_norm": 7.523574068635711, "learning_rate": 6.738929889298892e-07, "logits/chosen": 0.1360832154750824, "logits/rejected": -0.05363769456744194, "logps/chosen": -373.1499938964844, "logps/rejected": -369.29998779296875, "loss": 0.0705, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9501953125, "rewards/margins": 5.330468654632568, "rewards/rejected": -4.383593559265137, "step": 4950 }, { "epoch": 1.3073273589878756, "grad_norm": 10.945442365856618, "learning_rate": 6.732340537691091e-07, "logits/chosen": 0.22418823838233948, "logits/rejected": -0.011547851376235485, "logps/chosen": -383.6000061035156, "logps/rejected": -397.3999938964844, "loss": 0.0757, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8661254644393921, "rewards/margins": 5.125, "rewards/rejected": -4.262499809265137, "step": 4960 }, { "epoch": 1.3099630996309963, "grad_norm": 26.06918383275203, "learning_rate": 6.72575118608329e-07, "logits/chosen": 0.307037353515625, "logits/rejected": 0.21328124403953552, "logps/chosen": -367.29998779296875, "logps/rejected": -392.79998779296875, "loss": 0.0896, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7442382574081421, "rewards/margins": 5.025000095367432, "rewards/rejected": -4.278906345367432, "step": 4970 }, { "epoch": 1.312598840274117, "grad_norm": 153.9455080915411, "learning_rate": 6.719161834475487e-07, "logits/chosen": 0.11466064304113388, "logits/rejected": 0.11806640774011612, "logps/chosen": -349.8999938964844, "logps/rejected": -398.70001220703125, "loss": 0.1035, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3927368223667145, "rewards/margins": 4.917187690734863, "rewards/rejected": -4.525000095367432, "step": 4980 }, { "epoch": 1.3152345809172377, "grad_norm": 37.95908929910163, "learning_rate": 6.712572482867686e-07, "logits/chosen": 0.1676177978515625, "logits/rejected": -0.005322265438735485, "logps/chosen": -394.04998779296875, "logps/rejected": -371.54998779296875, "loss": 0.0546, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5040527582168579, "rewards/margins": 5.16015625, "rewards/rejected": -4.657031059265137, "step": 4990 }, { "epoch": 1.3178703215603584, "grad_norm": 20.636366726244383, "learning_rate": 6.705983131259883e-07, "logits/chosen": 0.01652831956744194, "logits/rejected": -0.151580810546875, "logps/chosen": -373.79998779296875, "logps/rejected": -399.45001220703125, "loss": 0.0748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.411376953125, "rewards/margins": 5.310937404632568, "rewards/rejected": -4.900000095367432, "step": 5000 }, { "epoch": 1.3205060622034792, "grad_norm": 23.489851495922178, "learning_rate": 6.699393779652082e-07, "logits/chosen": -0.041259765625, "logits/rejected": -0.0006103515625, "logps/chosen": -366.8999938964844, "logps/rejected": -387.1499938964844, "loss": 0.086, "rewards/accuracies": 0.96875, "rewards/chosen": 0.01535644568502903, "rewards/margins": 5.085156440734863, "rewards/rejected": -5.067968845367432, "step": 5010 }, { "epoch": 1.3231418028466, "grad_norm": 31.006651360958912, "learning_rate": 6.69280442804428e-07, "logits/chosen": 0.122406005859375, "logits/rejected": -0.12557831406593323, "logps/chosen": -406.20001220703125, "logps/rejected": -409.20001220703125, "loss": 0.101, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4771972596645355, "rewards/margins": 5.120312690734863, "rewards/rejected": -4.645312309265137, "step": 5020 }, { "epoch": 1.3257775434897205, "grad_norm": 31.65431929426308, "learning_rate": 6.686215076436479e-07, "logits/chosen": -0.0045410157181322575, "logits/rejected": -0.09049072116613388, "logps/chosen": -375.6499938964844, "logps/rejected": -391.0, "loss": 0.104, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.782763659954071, "rewards/margins": 5.083593845367432, "rewards/rejected": -4.301562309265137, "step": 5030 }, { "epoch": 1.3284132841328413, "grad_norm": 6.172447397020573, "learning_rate": 6.679625724828677e-07, "logits/chosen": 0.32740479707717896, "logits/rejected": 0.12241210788488388, "logps/chosen": -400.3999938964844, "logps/rejected": -409.3999938964844, "loss": 0.0725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5640929937362671, "rewards/margins": 5.299218654632568, "rewards/rejected": -4.736718654632568, "step": 5040 }, { "epoch": 1.331049024775962, "grad_norm": 31.42201956473299, "learning_rate": 6.673036373220875e-07, "logits/chosen": 0.16318359971046448, "logits/rejected": 0.03081665001809597, "logps/chosen": -349.8999938964844, "logps/rejected": -394.04998779296875, "loss": 0.0761, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4312500059604645, "rewards/margins": 5.287499904632568, "rewards/rejected": -4.854687690734863, "step": 5050 }, { "epoch": 1.3336847654190827, "grad_norm": 31.02074417853579, "learning_rate": 6.666447021613073e-07, "logits/chosen": 0.19493408501148224, "logits/rejected": 0.02902832068502903, "logps/chosen": -358.6499938964844, "logps/rejected": -377.1000061035156, "loss": 0.1177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4000793397426605, "rewards/margins": 5.050000190734863, "rewards/rejected": -4.6484375, "step": 5060 }, { "epoch": 1.3363205060622034, "grad_norm": 44.41996127441396, "learning_rate": 6.659857670005271e-07, "logits/chosen": 0.3562255799770355, "logits/rejected": 0.24282684922218323, "logps/chosen": -347.54998779296875, "logps/rejected": -369.29998779296875, "loss": 0.0973, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0021483898162842, "rewards/margins": 4.957812309265137, "rewards/rejected": -3.953906297683716, "step": 5070 }, { "epoch": 1.3389562467053242, "grad_norm": 30.485844820661715, "learning_rate": 6.653268318397469e-07, "logits/chosen": 0.27552491426467896, "logits/rejected": 0.06253661960363388, "logps/chosen": -342.75, "logps/rejected": -350.29998779296875, "loss": 0.107, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.069482445716858, "rewards/margins": 4.555468559265137, "rewards/rejected": -3.4898438453674316, "step": 5080 }, { "epoch": 1.341591987348445, "grad_norm": 62.92456346171141, "learning_rate": 6.646678966789668e-07, "logits/chosen": 0.19041137397289276, "logits/rejected": 0.0384521484375, "logps/chosen": -373.8999938964844, "logps/rejected": -385.29998779296875, "loss": 0.0997, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.883544921875, "rewards/margins": 5.05078125, "rewards/rejected": -4.171875, "step": 5090 }, { "epoch": 1.3442277279915658, "grad_norm": 10.691493615527628, "learning_rate": 6.640089615181865e-07, "logits/chosen": 0.08544311672449112, "logits/rejected": -0.02187499962747097, "logps/chosen": -369.1000061035156, "logps/rejected": -376.45001220703125, "loss": 0.093, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4357666075229645, "rewards/margins": 5.007031440734863, "rewards/rejected": -4.572656154632568, "step": 5100 }, { "epoch": 1.3468634686346863, "grad_norm": 7.629809054761935, "learning_rate": 6.633500263574065e-07, "logits/chosen": 0.16594238579273224, "logits/rejected": -0.02686767652630806, "logps/chosen": -363.0, "logps/rejected": -372.79998779296875, "loss": 0.1156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05563964694738388, "rewards/margins": 4.610937595367432, "rewards/rejected": -4.551562309265137, "step": 5110 }, { "epoch": 1.349499209277807, "grad_norm": 10.596490987251025, "learning_rate": 6.626910911966262e-07, "logits/chosen": 0.06320800632238388, "logits/rejected": 0.009265136905014515, "logps/chosen": -372.1000061035156, "logps/rejected": -411.3999938964844, "loss": 0.1052, "rewards/accuracies": 0.9375, "rewards/chosen": 0.278189092874527, "rewards/margins": 4.778124809265137, "rewards/rejected": -4.496874809265137, "step": 5120 }, { "epoch": 1.3521349499209276, "grad_norm": 29.019283288687127, "learning_rate": 6.620321560358461e-07, "logits/chosen": 0.11346435546875, "logits/rejected": 0.11401977390050888, "logps/chosen": -364.79998779296875, "logps/rejected": -397.5, "loss": 0.0827, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5580383539199829, "rewards/margins": 5.352343559265137, "rewards/rejected": -4.7890625, "step": 5130 }, { "epoch": 1.3547706905640484, "grad_norm": 20.157037197390068, "learning_rate": 6.613732208750659e-07, "logits/chosen": 0.03079834021627903, "logits/rejected": -0.04580078274011612, "logps/chosen": -349.25, "logps/rejected": -402.1000061035156, "loss": 0.1037, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.692578136920929, "rewards/margins": 5.224218845367432, "rewards/rejected": -4.529687404632568, "step": 5140 }, { "epoch": 1.3574064312071692, "grad_norm": 33.938501597307706, "learning_rate": 6.607142857142857e-07, "logits/chosen": 0.042724609375, "logits/rejected": -0.14879760146141052, "logps/chosen": -385.3999938964844, "logps/rejected": -412.1499938964844, "loss": 0.0581, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.47454833984375, "rewards/margins": 5.492968559265137, "rewards/rejected": -5.012499809265137, "step": 5150 }, { "epoch": 1.36004217185029, "grad_norm": 10.38746049320749, "learning_rate": 6.600553505535055e-07, "logits/chosen": 0.010162353515625, "logits/rejected": -0.05315551906824112, "logps/chosen": -382.79998779296875, "logps/rejected": -391.79998779296875, "loss": 0.1091, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5044800043106079, "rewards/margins": 5.296093940734863, "rewards/rejected": -4.795312404632568, "step": 5160 }, { "epoch": 1.3626779124934107, "grad_norm": 45.184616104435115, "learning_rate": 6.593964153927252e-07, "logits/chosen": 0.03945312649011612, "logits/rejected": -0.09421996772289276, "logps/chosen": -355.04998779296875, "logps/rejected": -374.6000061035156, "loss": 0.1009, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6296631097793579, "rewards/margins": 5.221093654632568, "rewards/rejected": -4.591406345367432, "step": 5170 }, { "epoch": 1.3653136531365313, "grad_norm": 44.59495574632941, "learning_rate": 6.587374802319452e-07, "logits/chosen": 0.21981200575828552, "logits/rejected": 0.08428955078125, "logps/chosen": -336.8999938964844, "logps/rejected": -385.3500061035156, "loss": 0.11, "rewards/accuracies": 0.96875, "rewards/chosen": 0.21916504204273224, "rewards/margins": 4.725781440734863, "rewards/rejected": -4.507031440734863, "step": 5180 }, { "epoch": 1.367949393779652, "grad_norm": 30.344645519389413, "learning_rate": 6.58078545071165e-07, "logits/chosen": 0.05832214280962944, "logits/rejected": -0.05391235277056694, "logps/chosen": -380.70001220703125, "logps/rejected": -417.04998779296875, "loss": 0.1047, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04615478590130806, "rewards/margins": 5.430468559265137, "rewards/rejected": -5.383593559265137, "step": 5190 }, { "epoch": 1.3705851344227729, "grad_norm": 29.372247293585893, "learning_rate": 6.574196099103848e-07, "logits/chosen": -0.0728912353515625, "logits/rejected": -0.25764161348342896, "logps/chosen": -372.1000061035156, "logps/rejected": -378.70001220703125, "loss": 0.0801, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.3025878965854645, "rewards/margins": 5.395312309265137, "rewards/rejected": -5.09375, "step": 5200 }, { "epoch": 1.3732208750658934, "grad_norm": 42.653735441493275, "learning_rate": 6.567606747496046e-07, "logits/chosen": 0.15913085639476776, "logits/rejected": 0.064056396484375, "logps/chosen": -344.20001220703125, "logps/rejected": -381.6499938964844, "loss": 0.0957, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.34833985567092896, "rewards/margins": 4.82421875, "rewards/rejected": -4.479687690734863, "step": 5210 }, { "epoch": 1.3758566157090142, "grad_norm": 16.17641046463398, "learning_rate": 6.561017395888244e-07, "logits/chosen": 0.04368286207318306, "logits/rejected": -0.03680114820599556, "logps/chosen": -351.54998779296875, "logps/rejected": -361.95001220703125, "loss": 0.0986, "rewards/accuracies": 0.96875, "rewards/chosen": 0.31800538301467896, "rewards/margins": 4.954687595367432, "rewards/rejected": -4.63671875, "step": 5220 }, { "epoch": 1.378492356352135, "grad_norm": 21.948604455676847, "learning_rate": 6.554428044280442e-07, "logits/chosen": 0.14208069443702698, "logits/rejected": -0.02338867262005806, "logps/chosen": -370.20001220703125, "logps/rejected": -415.1000061035156, "loss": 0.0662, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6751800775527954, "rewards/margins": 4.964062690734863, "rewards/rejected": -4.290625095367432, "step": 5230 }, { "epoch": 1.3811280969952557, "grad_norm": 57.55273076667655, "learning_rate": 6.54783869267264e-07, "logits/chosen": 0.27308350801467896, "logits/rejected": 0.027984619140625, "logps/chosen": -385.6499938964844, "logps/rejected": -417.45001220703125, "loss": 0.1119, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6133056879043579, "rewards/margins": 5.615624904632568, "rewards/rejected": -5.000781059265137, "step": 5240 }, { "epoch": 1.3837638376383765, "grad_norm": 34.107832393364156, "learning_rate": 6.541249341064838e-07, "logits/chosen": -0.03952636569738388, "logits/rejected": -0.06430663913488388, "logps/chosen": -325.6000061035156, "logps/rejected": -372.25, "loss": 0.0909, "rewards/accuracies": 0.96875, "rewards/chosen": 0.03449096530675888, "rewards/margins": 5.123437404632568, "rewards/rejected": -5.085156440734863, "step": 5250 }, { "epoch": 1.386399578281497, "grad_norm": 33.226388251383256, "learning_rate": 6.534659989457038e-07, "logits/chosen": 0.036041259765625, "logits/rejected": -0.10207519680261612, "logps/chosen": -379.79998779296875, "logps/rejected": -396.70001220703125, "loss": 0.1243, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30695801973342896, "rewards/margins": 5.096875190734863, "rewards/rejected": -4.787499904632568, "step": 5260 }, { "epoch": 1.3890353189246178, "grad_norm": 98.57523395124232, "learning_rate": 6.528070637849235e-07, "logits/chosen": -0.02623290941119194, "logits/rejected": -0.05019836500287056, "logps/chosen": -330.0, "logps/rejected": -373.79998779296875, "loss": 0.1287, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.593188464641571, "rewards/margins": 4.827734470367432, "rewards/rejected": -4.234765529632568, "step": 5270 }, { "epoch": 1.3916710595677384, "grad_norm": 22.886408332918162, "learning_rate": 6.521481286241434e-07, "logits/chosen": 0.19169922173023224, "logits/rejected": -0.02605590783059597, "logps/chosen": -342.79998779296875, "logps/rejected": -378.75, "loss": 0.0725, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5200439691543579, "rewards/margins": 4.795312404632568, "rewards/rejected": -4.275000095367432, "step": 5280 }, { "epoch": 1.3943068002108592, "grad_norm": 19.53306857927286, "learning_rate": 6.514891934633631e-07, "logits/chosen": 0.19133301079273224, "logits/rejected": 0.022353744134306908, "logps/chosen": -352.70001220703125, "logps/rejected": -412.29998779296875, "loss": 0.1172, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.629382312297821, "rewards/margins": 5.058203220367432, "rewards/rejected": -4.430078029632568, "step": 5290 }, { "epoch": 1.39694254085398, "grad_norm": 32.00367329585916, "learning_rate": 6.50830258302583e-07, "logits/chosen": 0.20227661728858948, "logits/rejected": -0.03413086012005806, "logps/chosen": -349.20001220703125, "logps/rejected": -350.54998779296875, "loss": 0.0969, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7038818597793579, "rewards/margins": 4.612500190734863, "rewards/rejected": -3.9078125953674316, "step": 5300 }, { "epoch": 1.3995782814971007, "grad_norm": 19.883571045013543, "learning_rate": 6.501713231418028e-07, "logits/chosen": 0.14589843153953552, "logits/rejected": 0.09474487602710724, "logps/chosen": -375.0, "logps/rejected": -426.79998779296875, "loss": 0.053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.8713623285293579, "rewards/margins": 5.754687309265137, "rewards/rejected": -4.879687309265137, "step": 5310 }, { "epoch": 1.4022140221402215, "grad_norm": 35.149934758159645, "learning_rate": 6.495123879810226e-07, "logits/chosen": 0.05823669582605362, "logits/rejected": -0.10626220703125, "logps/chosen": -377.0, "logps/rejected": -379.5, "loss": 0.1135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3837036192417145, "rewards/margins": 5.532031059265137, "rewards/rejected": -5.146874904632568, "step": 5320 }, { "epoch": 1.404849762783342, "grad_norm": 23.784699350943157, "learning_rate": 6.488534528202425e-07, "logits/chosen": 0.24164429306983948, "logits/rejected": 0.0103759765625, "logps/chosen": -388.3999938964844, "logps/rejected": -344.75, "loss": 0.1203, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.762341320514679, "rewards/margins": 4.4921875, "rewards/rejected": -3.727343797683716, "step": 5330 }, { "epoch": 1.4074855034264628, "grad_norm": 43.07091669620202, "learning_rate": 6.481945176594623e-07, "logits/chosen": 0.21336670219898224, "logits/rejected": 0.1055908203125, "logps/chosen": -343.3999938964844, "logps/rejected": -380.3500061035156, "loss": 0.127, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.627185046672821, "rewards/margins": 5.092968940734863, "rewards/rejected": -4.469531059265137, "step": 5340 }, { "epoch": 1.4101212440695836, "grad_norm": 32.450724238834034, "learning_rate": 6.475355824986821e-07, "logits/chosen": 0.26829832792282104, "logits/rejected": 0.10840453952550888, "logps/chosen": -406.1000061035156, "logps/rejected": -417.70001220703125, "loss": 0.0666, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.665087878704071, "rewards/margins": 5.379687309265137, "rewards/rejected": -4.71484375, "step": 5350 }, { "epoch": 1.4127569847127042, "grad_norm": 43.07367915407378, "learning_rate": 6.46876647337902e-07, "logits/chosen": 0.22700805962085724, "logits/rejected": 0.0017578124534338713, "logps/chosen": -412.1000061035156, "logps/rejected": -391.1000061035156, "loss": 0.144, "rewards/accuracies": 0.9375, "rewards/chosen": 0.34379881620407104, "rewards/margins": 5.035937309265137, "rewards/rejected": -4.689843654632568, "step": 5360 }, { "epoch": 1.415392725355825, "grad_norm": 30.87281178453081, "learning_rate": 6.462177121771217e-07, "logits/chosen": 0.09658203274011612, "logits/rejected": -0.013195800594985485, "logps/chosen": -393.04998779296875, "logps/rejected": -380.54998779296875, "loss": 0.1168, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.07404784858226776, "rewards/margins": 4.94921875, "rewards/rejected": -4.875, "step": 5370 }, { "epoch": 1.4180284659989457, "grad_norm": 10.276618467747165, "learning_rate": 6.455587770163416e-07, "logits/chosen": 0.18693847954273224, "logits/rejected": 0.12669067084789276, "logps/chosen": -346.6499938964844, "logps/rejected": -408.95001220703125, "loss": 0.0882, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09183349460363388, "rewards/margins": 5.227343559265137, "rewards/rejected": -5.321093559265137, "step": 5380 }, { "epoch": 1.4206642066420665, "grad_norm": 51.58383082272025, "learning_rate": 6.448998418555613e-07, "logits/chosen": 0.09511718899011612, "logits/rejected": 0.04514770582318306, "logps/chosen": -346.70001220703125, "logps/rejected": -385.8999938964844, "loss": 0.1174, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.07109375298023224, "rewards/margins": 4.7890625, "rewards/rejected": -4.723437309265137, "step": 5390 }, { "epoch": 1.4232999472851873, "grad_norm": 29.601799914880875, "learning_rate": 6.442409066947812e-07, "logits/chosen": 0.18705444037914276, "logits/rejected": -0.03972167894244194, "logps/chosen": -388.79998779296875, "logps/rejected": -393.6000061035156, "loss": 0.1161, "rewards/accuracies": 0.96875, "rewards/chosen": 0.758990466594696, "rewards/margins": 5.416406154632568, "rewards/rejected": -4.658593654632568, "step": 5400 }, { "epoch": 1.4259356879283078, "grad_norm": 22.9281150063431, "learning_rate": 6.43581971534001e-07, "logits/chosen": 0.2911926209926605, "logits/rejected": 0.12528686225414276, "logps/chosen": -363.1000061035156, "logps/rejected": -366.75, "loss": 0.0977, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.792590320110321, "rewards/margins": 5.092968940734863, "rewards/rejected": -4.300000190734863, "step": 5410 }, { "epoch": 1.4285714285714286, "grad_norm": 43.59210073792613, "learning_rate": 6.429230363732209e-07, "logits/chosen": 0.2804199159145355, "logits/rejected": 0.13047485053539276, "logps/chosen": -368.45001220703125, "logps/rejected": -362.25, "loss": 0.1088, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.80535888671875, "rewards/margins": 4.833593845367432, "rewards/rejected": -4.02734375, "step": 5420 }, { "epoch": 1.4312071692145492, "grad_norm": 32.03260807037076, "learning_rate": 6.422641012124407e-07, "logits/chosen": 0.24730224907398224, "logits/rejected": 0.04517211765050888, "logps/chosen": -374.5, "logps/rejected": -371.6499938964844, "loss": 0.103, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.651928722858429, "rewards/margins": 4.752343654632568, "rewards/rejected": -4.098828315734863, "step": 5430 }, { "epoch": 1.43384290985767, "grad_norm": 8.196394984870263, "learning_rate": 6.416051660516605e-07, "logits/chosen": 0.26182860136032104, "logits/rejected": 0.01924438402056694, "logps/chosen": -394.29998779296875, "logps/rejected": -425.1000061035156, "loss": 0.0767, "rewards/accuracies": 0.96875, "rewards/chosen": 0.53973388671875, "rewards/margins": 5.724999904632568, "rewards/rejected": -5.181250095367432, "step": 5440 }, { "epoch": 1.4364786505007907, "grad_norm": 52.497028714522244, "learning_rate": 6.409462308908803e-07, "logits/chosen": 0.18576660752296448, "logits/rejected": -0.020843505859375, "logps/chosen": -376.6000061035156, "logps/rejected": -373.25, "loss": 0.1027, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.548901379108429, "rewards/margins": 5.018750190734863, "rewards/rejected": -4.475781440734863, "step": 5450 }, { "epoch": 1.4391143911439115, "grad_norm": 58.79949082740601, "learning_rate": 6.402872957301001e-07, "logits/chosen": 0.07954101264476776, "logits/rejected": -0.07764892280101776, "logps/chosen": -359.6000061035156, "logps/rejected": -379.8999938964844, "loss": 0.1162, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4389282166957855, "rewards/margins": 4.965624809265137, "rewards/rejected": -4.52734375, "step": 5460 }, { "epoch": 1.4417501317870323, "grad_norm": 8.384864457080837, "learning_rate": 6.396283605693199e-07, "logits/chosen": 0.32864075899124146, "logits/rejected": -0.005978393368422985, "logps/chosen": -325.1000061035156, "logps/rejected": -361.25, "loss": 0.0589, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.8646240234375, "rewards/margins": 5.107031345367432, "rewards/rejected": -4.250781059265137, "step": 5470 }, { "epoch": 1.4443858724301528, "grad_norm": 49.679389435583055, "learning_rate": 6.389694254085399e-07, "logits/chosen": 0.29448240995407104, "logits/rejected": 0.15312500298023224, "logps/chosen": -369.3500061035156, "logps/rejected": -385.5, "loss": 0.0952, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.547534167766571, "rewards/margins": 5.130468845367432, "rewards/rejected": -4.587500095367432, "step": 5480 }, { "epoch": 1.4470216130732736, "grad_norm": 43.352671309752594, "learning_rate": 6.383104902477596e-07, "logits/chosen": 0.14279785752296448, "logits/rejected": 0.04599609225988388, "logps/chosen": -359.29998779296875, "logps/rejected": -418.29998779296875, "loss": 0.1576, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17101439833641052, "rewards/margins": 5.580468654632568, "rewards/rejected": -5.40625, "step": 5490 }, { "epoch": 1.4496573537163944, "grad_norm": 56.81532267159033, "learning_rate": 6.376515550869795e-07, "logits/chosen": 0.33892822265625, "logits/rejected": 0.07464599609375, "logps/chosen": -385.8500061035156, "logps/rejected": -372.0, "loss": 0.0831, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4098663330078125, "rewards/margins": 5.157812595367432, "rewards/rejected": -4.747656345367432, "step": 5500 }, { "epoch": 1.452293094359515, "grad_norm": 37.31150855222955, "learning_rate": 6.369926199261992e-07, "logits/chosen": 0.26188963651657104, "logits/rejected": 0.029052734375, "logps/chosen": -415.0, "logps/rejected": -461.29998779296875, "loss": 0.1032, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6218627691268921, "rewards/margins": 5.828906059265137, "rewards/rejected": -5.206250190734863, "step": 5510 }, { "epoch": 1.4549288350026357, "grad_norm": 31.424097128367087, "learning_rate": 6.36333684765419e-07, "logits/chosen": 0.447784423828125, "logits/rejected": -0.03265380859375, "logps/chosen": -384.5, "logps/rejected": -361.54998779296875, "loss": 0.067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.32329100370407104, "rewards/margins": 5.108593940734863, "rewards/rejected": -4.787499904632568, "step": 5520 }, { "epoch": 1.4575645756457565, "grad_norm": 42.91501009493625, "learning_rate": 6.356747496046389e-07, "logits/chosen": 0.17647095024585724, "logits/rejected": 0.14620360732078552, "logps/chosen": -322.3500061035156, "logps/rejected": -366.3999938964844, "loss": 0.0833, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3792175352573395, "rewards/margins": 5.36328125, "rewards/rejected": -4.982812404632568, "step": 5530 }, { "epoch": 1.4602003162888773, "grad_norm": 15.572160874037653, "learning_rate": 6.350158144438586e-07, "logits/chosen": 0.03157959133386612, "logits/rejected": -0.08408813178539276, "logps/chosen": -393.79998779296875, "logps/rejected": -402.79998779296875, "loss": 0.0728, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4762206971645355, "rewards/margins": 5.627343654632568, "rewards/rejected": -5.1484375, "step": 5540 }, { "epoch": 1.462836056931998, "grad_norm": 54.43241392612363, "learning_rate": 6.343568792830785e-07, "logits/chosen": 0.17131805419921875, "logits/rejected": -0.04813232272863388, "logps/chosen": -398.29998779296875, "logps/rejected": -404.5, "loss": 0.087, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.17385253310203552, "rewards/margins": 5.296093940734863, "rewards/rejected": -5.471875190734863, "step": 5550 }, { "epoch": 1.4654717975751186, "grad_norm": 11.096015408817488, "learning_rate": 6.336979441222983e-07, "logits/chosen": 0.09285888820886612, "logits/rejected": -0.07669677585363388, "logps/chosen": -409.95001220703125, "logps/rejected": -388.5, "loss": 0.0682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.49492186307907104, "rewards/margins": 5.529687404632568, "rewards/rejected": -5.032812595367432, "step": 5560 }, { "epoch": 1.4681075382182394, "grad_norm": 61.00591625810504, "learning_rate": 6.330390089615182e-07, "logits/chosen": 0.22774887084960938, "logits/rejected": 0.06696166843175888, "logps/chosen": -357.75, "logps/rejected": -370.8500061035156, "loss": 0.0855, "rewards/accuracies": 0.96875, "rewards/chosen": 0.001708984375, "rewards/margins": 5.063281059265137, "rewards/rejected": -5.06640625, "step": 5570 }, { "epoch": 1.47074327886136, "grad_norm": 36.08292947650115, "learning_rate": 6.323800738007379e-07, "logits/chosen": 0.1443023681640625, "logits/rejected": 0.10687255859375, "logps/chosen": -334.04998779296875, "logps/rejected": -356.5, "loss": 0.1458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2818969786167145, "rewards/margins": 4.858593940734863, "rewards/rejected": -4.575781345367432, "step": 5580 }, { "epoch": 1.4733790195044807, "grad_norm": 18.39167533913701, "learning_rate": 6.317211386399578e-07, "logits/chosen": 0.27876585721969604, "logits/rejected": 0.10401000827550888, "logps/chosen": -367.79998779296875, "logps/rejected": -358.79998779296875, "loss": 0.0799, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4466919004917145, "rewards/margins": 5.146874904632568, "rewards/rejected": -4.703906059265137, "step": 5590 }, { "epoch": 1.4760147601476015, "grad_norm": 39.56271454518899, "learning_rate": 6.310622034791776e-07, "logits/chosen": 0.12489013373851776, "logits/rejected": 0.09317932277917862, "logps/chosen": -341.8999938964844, "logps/rejected": -387.8500061035156, "loss": 0.1262, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2859863340854645, "rewards/margins": 5.065625190734863, "rewards/rejected": -4.78125, "step": 5600 }, { "epoch": 1.4786505007907222, "grad_norm": 38.231826954240965, "learning_rate": 6.304032683183974e-07, "logits/chosen": 0.20942382514476776, "logits/rejected": -0.06331787258386612, "logps/chosen": -367.3999938964844, "logps/rejected": -384.0, "loss": 0.07, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.24417725205421448, "rewards/margins": 5.521874904632568, "rewards/rejected": -5.2734375, "step": 5610 }, { "epoch": 1.481286241433843, "grad_norm": 30.495608913987247, "learning_rate": 6.297443331576172e-07, "logits/chosen": 0.16251830756664276, "logits/rejected": -0.0032836913596838713, "logps/chosen": -340.20001220703125, "logps/rejected": -375.95001220703125, "loss": 0.0953, "rewards/accuracies": 0.96875, "rewards/chosen": -0.05327148362994194, "rewards/margins": 5.400781154632568, "rewards/rejected": -5.451562404632568, "step": 5620 }, { "epoch": 1.4839219820769636, "grad_norm": 45.36621845450597, "learning_rate": 6.290853979968371e-07, "logits/chosen": 0.156951904296875, "logits/rejected": -0.1116943359375, "logps/chosen": -374.1499938964844, "logps/rejected": -393.3999938964844, "loss": 0.0766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.04885254055261612, "rewards/margins": 5.172656059265137, "rewards/rejected": -5.22265625, "step": 5630 }, { "epoch": 1.4865577227200844, "grad_norm": 45.00279183339927, "learning_rate": 6.284264628360569e-07, "logits/chosen": 0.02204589918255806, "logits/rejected": -0.03823242336511612, "logps/chosen": -396.29998779296875, "logps/rejected": -381.8500061035156, "loss": 0.1473, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.0007812500116415322, "rewards/margins": 4.780468940734863, "rewards/rejected": -4.786718845367432, "step": 5640 }, { "epoch": 1.4891934633632051, "grad_norm": 36.19662090942447, "learning_rate": 6.277675276752768e-07, "logits/chosen": 0.060546875, "logits/rejected": 0.05380859225988388, "logps/chosen": -361.5, "logps/rejected": -407.1000061035156, "loss": 0.0961, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.086181640625, "rewards/margins": 5.59375, "rewards/rejected": -5.5078125, "step": 5650 }, { "epoch": 1.4918292040063257, "grad_norm": 16.805846889493004, "learning_rate": 6.271085925144965e-07, "logits/chosen": 0.12968750298023224, "logits/rejected": 0.0384521484375, "logps/chosen": -372.25, "logps/rejected": -392.3999938964844, "loss": 0.0985, "rewards/accuracies": 0.96875, "rewards/chosen": 0.584033191204071, "rewards/margins": 4.910937309265137, "rewards/rejected": -4.324999809265137, "step": 5660 }, { "epoch": 1.4944649446494465, "grad_norm": 69.71092235032559, "learning_rate": 6.264496573537164e-07, "logits/chosen": 0.03220214694738388, "logits/rejected": -0.09333495795726776, "logps/chosen": -353.29998779296875, "logps/rejected": -404.1000061035156, "loss": 0.1351, "rewards/accuracies": 0.9375, "rewards/chosen": 0.55426025390625, "rewards/margins": 5.064062595367432, "rewards/rejected": -4.505468845367432, "step": 5670 }, { "epoch": 1.4971006852925672, "grad_norm": 26.918783641826195, "learning_rate": 6.257907221929361e-07, "logits/chosen": 0.14638671278953552, "logits/rejected": 0.03224639967083931, "logps/chosen": -350.3500061035156, "logps/rejected": -367.75, "loss": 0.0898, "rewards/accuracies": 0.96875, "rewards/chosen": 0.699511706829071, "rewards/margins": 4.79296875, "rewards/rejected": -4.095312595367432, "step": 5680 }, { "epoch": 1.499736425935688, "grad_norm": 22.74991900033693, "learning_rate": 6.25131787032156e-07, "logits/chosen": 0.1164344772696495, "logits/rejected": -0.06043548509478569, "logps/chosen": -334.25, "logps/rejected": -348.3500061035156, "loss": 0.0854, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.14091797173023224, "rewards/margins": 4.706250190734863, "rewards/rejected": -4.564843654632568, "step": 5690 }, { "epoch": 1.5023721665788088, "grad_norm": 28.591503855896562, "learning_rate": 6.244728518713759e-07, "logits/chosen": 0.2632507383823395, "logits/rejected": 0.02891845628619194, "logps/chosen": -392.3500061035156, "logps/rejected": -411.8999938964844, "loss": 0.1103, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.34318238496780396, "rewards/margins": 5.259375095367432, "rewards/rejected": -4.917187690734863, "step": 5700 }, { "epoch": 1.5050079072219293, "grad_norm": 29.55278057032384, "learning_rate": 6.238139167105957e-07, "logits/chosen": 0.20981445908546448, "logits/rejected": 0.10256347805261612, "logps/chosen": -333.3500061035156, "logps/rejected": -388.04998779296875, "loss": 0.1218, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16431884467601776, "rewards/margins": 4.775781154632568, "rewards/rejected": -4.614062309265137, "step": 5710 }, { "epoch": 1.5076436478650501, "grad_norm": 9.579644035976946, "learning_rate": 6.231549815498155e-07, "logits/chosen": 0.05994568020105362, "logits/rejected": 0.06414794921875, "logps/chosen": -405.95001220703125, "logps/rejected": -403.3999938964844, "loss": 0.0976, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3204711973667145, "rewards/margins": 5.078906059265137, "rewards/rejected": -4.756249904632568, "step": 5720 }, { "epoch": 1.5102793885081707, "grad_norm": 37.75943841261397, "learning_rate": 6.224960463890353e-07, "logits/chosen": 0.13176269829273224, "logits/rejected": -0.014721679501235485, "logps/chosen": -337.1000061035156, "logps/rejected": -369.20001220703125, "loss": 0.0752, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.12598876655101776, "rewards/margins": 5.186718940734863, "rewards/rejected": -5.310937404632568, "step": 5730 }, { "epoch": 1.5129151291512914, "grad_norm": 42.02536408986192, "learning_rate": 6.218371112282551e-07, "logits/chosen": -0.0062500000931322575, "logits/rejected": -0.09783630073070526, "logps/chosen": -382.17498779296875, "logps/rejected": -379.6499938964844, "loss": 0.0973, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18970337510108948, "rewards/margins": 4.890625, "rewards/rejected": -4.700781345367432, "step": 5740 }, { "epoch": 1.5155508697944122, "grad_norm": 6.479251288296398, "learning_rate": 6.211781760674749e-07, "logits/chosen": 0.3144287168979645, "logits/rejected": -0.00010986327833961695, "logps/chosen": -415.6000061035156, "logps/rejected": -427.29998779296875, "loss": 0.076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2776855528354645, "rewards/margins": 5.508593559265137, "rewards/rejected": -5.235156059265137, "step": 5750 }, { "epoch": 1.518186610437533, "grad_norm": 20.88461368272018, "learning_rate": 6.205192409066947e-07, "logits/chosen": 0.10547332465648651, "logits/rejected": -0.18239136040210724, "logps/chosen": -423.20001220703125, "logps/rejected": -392.8500061035156, "loss": 0.0803, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3380371034145355, "rewards/margins": 5.278124809265137, "rewards/rejected": -4.939062595367432, "step": 5760 }, { "epoch": 1.5208223510806538, "grad_norm": 10.46747173758256, "learning_rate": 6.198603057459146e-07, "logits/chosen": 0.17626953125, "logits/rejected": -0.06401367485523224, "logps/chosen": -392.95001220703125, "logps/rejected": -429.6499938964844, "loss": 0.0604, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09230957180261612, "rewards/margins": 5.142968654632568, "rewards/rejected": -5.047656059265137, "step": 5770 }, { "epoch": 1.5234580917237743, "grad_norm": 41.26777085205621, "learning_rate": 6.192013705851344e-07, "logits/chosen": 0.177113339304924, "logits/rejected": 0.11131592094898224, "logps/chosen": -364.95001220703125, "logps/rejected": -414.6499938964844, "loss": 0.0553, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.2791503965854645, "rewards/margins": 5.295312404632568, "rewards/rejected": -5.01953125, "step": 5780 }, { "epoch": 1.526093832366895, "grad_norm": 52.02100203356942, "learning_rate": 6.185424354243543e-07, "logits/chosen": 0.16245117783546448, "logits/rejected": -0.07382812350988388, "logps/chosen": -352.0, "logps/rejected": -383.75, "loss": 0.102, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.14863280951976776, "rewards/margins": 5.178906440734863, "rewards/rejected": -5.032812595367432, "step": 5790 }, { "epoch": 1.5287295730100157, "grad_norm": 29.956777311271978, "learning_rate": 6.17883500263574e-07, "logits/chosen": 0.0026916502974927425, "logits/rejected": -0.02388915978372097, "logps/chosen": -365.95001220703125, "logps/rejected": -342.79998779296875, "loss": 0.1168, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.287649542093277, "rewards/margins": 5.131249904632568, "rewards/rejected": -4.842187404632568, "step": 5800 }, { "epoch": 1.5313653136531364, "grad_norm": 24.15683374769115, "learning_rate": 6.172245651027939e-07, "logits/chosen": 0.11976318061351776, "logits/rejected": -0.008685302920639515, "logps/chosen": -349.25, "logps/rejected": -398.6000061035156, "loss": 0.1109, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16889648139476776, "rewards/margins": 5.801562309265137, "rewards/rejected": -5.631249904632568, "step": 5810 }, { "epoch": 1.5340010542962572, "grad_norm": 43.07034035916491, "learning_rate": 6.165656299420137e-07, "logits/chosen": 0.05014648288488388, "logits/rejected": -0.10408935695886612, "logps/chosen": -398.75, "logps/rejected": -398.1499938964844, "loss": 0.0971, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.191375732421875, "rewards/margins": 5.435937404632568, "rewards/rejected": -5.242968559265137, "step": 5820 }, { "epoch": 1.536636794939378, "grad_norm": 9.461055000063697, "learning_rate": 6.159066947812335e-07, "logits/chosen": 0.10209961235523224, "logits/rejected": -0.05771484225988388, "logps/chosen": -337.95001220703125, "logps/rejected": -388.70001220703125, "loss": 0.0895, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.12277831882238388, "rewards/margins": 5.243750095367432, "rewards/rejected": -5.123437404632568, "step": 5830 }, { "epoch": 1.5392725355824988, "grad_norm": 96.78197858135505, "learning_rate": 6.152477596204533e-07, "logits/chosen": 0.132781982421875, "logits/rejected": -0.053863525390625, "logps/chosen": -370.5, "logps/rejected": -395.29998779296875, "loss": 0.1112, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14492186903953552, "rewards/margins": 5.54296875, "rewards/rejected": -5.400781154632568, "step": 5840 }, { "epoch": 1.5419082762256195, "grad_norm": 34.33957062873889, "learning_rate": 6.145888244596732e-07, "logits/chosen": 0.08034668117761612, "logits/rejected": 0.02197265625, "logps/chosen": -378.3999938964844, "logps/rejected": -422.0, "loss": 0.1165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.06041259691119194, "rewards/margins": 5.43359375, "rewards/rejected": -5.492968559265137, "step": 5850 }, { "epoch": 1.54454401686874, "grad_norm": 71.94811676807348, "learning_rate": 6.13929889298893e-07, "logits/chosen": -0.03605956956744194, "logits/rejected": 0.0038818358443677425, "logps/chosen": -369.75, "logps/rejected": -418.79998779296875, "loss": 0.1188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.577362060546875, "rewards/margins": 5.335156440734863, "rewards/rejected": -4.754687309265137, "step": 5860 }, { "epoch": 1.5471797575118609, "grad_norm": 13.361621194548992, "learning_rate": 6.132709541381129e-07, "logits/chosen": 0.12041320651769638, "logits/rejected": 0.0010498047340661287, "logps/chosen": -382.8999938964844, "logps/rejected": -397.79998779296875, "loss": 0.0818, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.3050247132778168, "rewards/margins": 5.10546875, "rewards/rejected": -4.801562309265137, "step": 5870 }, { "epoch": 1.5498154981549814, "grad_norm": 65.76109700292604, "learning_rate": 6.126120189773326e-07, "logits/chosen": 0.11796875298023224, "logits/rejected": -0.07803420722484589, "logps/chosen": -361.0, "logps/rejected": -390.1499938964844, "loss": 0.1525, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.35546875, "rewards/margins": 4.749218940734863, "rewards/rejected": -4.391797065734863, "step": 5880 }, { "epoch": 1.5524512387981022, "grad_norm": 25.88874308569559, "learning_rate": 6.119530838165525e-07, "logits/chosen": 0.12972870469093323, "logits/rejected": 0.0722251906991005, "logps/chosen": -359.3999938964844, "logps/rejected": -375.20001220703125, "loss": 0.1528, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.11951903998851776, "rewards/margins": 4.96484375, "rewards/rejected": -4.8515625, "step": 5890 }, { "epoch": 1.555086979441223, "grad_norm": 10.463017279612435, "learning_rate": 6.112941486557722e-07, "logits/chosen": 0.17390136420726776, "logits/rejected": -0.02052002027630806, "logps/chosen": -357.1499938964844, "logps/rejected": -366.0, "loss": 0.0648, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.2613281309604645, "rewards/margins": 4.966406345367432, "rewards/rejected": -4.703906059265137, "step": 5900 }, { "epoch": 1.5577227200843438, "grad_norm": 9.073584964207742, "learning_rate": 6.10635213494992e-07, "logits/chosen": 0.10364989936351776, "logits/rejected": 0.17438964545726776, "logps/chosen": -337.8999938964844, "logps/rejected": -379.3999938964844, "loss": 0.0863, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3426513671875, "rewards/margins": 4.880468845367432, "rewards/rejected": -4.533593654632568, "step": 5910 }, { "epoch": 1.5603584607274645, "grad_norm": 40.509613705193985, "learning_rate": 6.099762783342118e-07, "logits/chosen": 0.15765686333179474, "logits/rejected": 0.06198730319738388, "logps/chosen": -405.1000061035156, "logps/rejected": -368.45001220703125, "loss": 0.0792, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.737500011920929, "rewards/margins": 4.974999904632568, "rewards/rejected": -4.236718654632568, "step": 5920 }, { "epoch": 1.562994201370585, "grad_norm": 53.2072720146459, "learning_rate": 6.093173431734317e-07, "logits/chosen": 0.21263428032398224, "logits/rejected": -0.05875549465417862, "logps/chosen": -381.70001220703125, "logps/rejected": -374.70001220703125, "loss": 0.071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.825183093547821, "rewards/margins": 5.209374904632568, "rewards/rejected": -4.380468845367432, "step": 5930 }, { "epoch": 1.5656299420137059, "grad_norm": 33.65932982283109, "learning_rate": 6.086584080126516e-07, "logits/chosen": 0.13139037787914276, "logits/rejected": -0.0056243897415697575, "logps/chosen": -358.1000061035156, "logps/rejected": -385.20001220703125, "loss": 0.118, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.6470702886581421, "rewards/margins": 4.682812690734863, "rewards/rejected": -4.037499904632568, "step": 5940 }, { "epoch": 1.5682656826568264, "grad_norm": 14.344649018921281, "learning_rate": 6.079994728518713e-07, "logits/chosen": 0.16937103867530823, "logits/rejected": 0.015594482421875, "logps/chosen": -413.45001220703125, "logps/rejected": -389.8500061035156, "loss": 0.0916, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.3946166932582855, "rewards/margins": 5.314843654632568, "rewards/rejected": -4.914843559265137, "step": 5950 }, { "epoch": 1.5709014232999472, "grad_norm": 59.938184933494995, "learning_rate": 6.073405376910912e-07, "logits/chosen": 0.017333984375, "logits/rejected": -0.11052551120519638, "logps/chosen": -360.6000061035156, "logps/rejected": -386.1000061035156, "loss": 0.074, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.566485583782196, "rewards/margins": 5.228906154632568, "rewards/rejected": -4.6640625, "step": 5960 }, { "epoch": 1.573537163943068, "grad_norm": 32.75184899673545, "learning_rate": 6.066816025303109e-07, "logits/chosen": 0.08965606987476349, "logits/rejected": 0.031280517578125, "logps/chosen": -373.29998779296875, "logps/rejected": -420.5, "loss": 0.1374, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.39997559785842896, "rewards/margins": 5.451562404632568, "rewards/rejected": -5.049218654632568, "step": 5970 }, { "epoch": 1.5761729045861887, "grad_norm": 58.09070977404346, "learning_rate": 6.060226673695308e-07, "logits/chosen": 0.17760315537452698, "logits/rejected": -0.0031005858909338713, "logps/chosen": -382.45001220703125, "logps/rejected": -347.45001220703125, "loss": 0.0816, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20927734673023224, "rewards/margins": 4.8125, "rewards/rejected": -4.6015625, "step": 5980 }, { "epoch": 1.5788086452293095, "grad_norm": 16.96758599210906, "learning_rate": 6.053637322087506e-07, "logits/chosen": 0.124298095703125, "logits/rejected": -0.07357177883386612, "logps/chosen": -362.70001220703125, "logps/rejected": -361.54998779296875, "loss": 0.0827, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5251220464706421, "rewards/margins": 5.099999904632568, "rewards/rejected": -4.578906059265137, "step": 5990 }, { "epoch": 1.5814443858724303, "grad_norm": 50.60692220151019, "learning_rate": 6.047047970479705e-07, "logits/chosen": 0.11917724460363388, "logits/rejected": 0.02506103552877903, "logps/chosen": -375.45001220703125, "logps/rejected": -412.20001220703125, "loss": 0.0892, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.00848388671875, "rewards/margins": 4.997656345367432, "rewards/rejected": -5.003125190734863, "step": 6000 }, { "epoch": 1.5840801265155509, "grad_norm": 28.572463672478627, "learning_rate": 6.040458618871903e-07, "logits/chosen": -0.02485351637005806, "logits/rejected": -0.14720459282398224, "logps/chosen": -405.6499938964844, "logps/rejected": -445.0, "loss": 0.0938, "rewards/accuracies": 0.96875, "rewards/chosen": -0.177703857421875, "rewards/margins": 5.918749809265137, "rewards/rejected": -6.095312595367432, "step": 6010 }, { "epoch": 1.5867158671586716, "grad_norm": 63.99845340872856, "learning_rate": 6.033869267264101e-07, "logits/chosen": 0.08999023586511612, "logits/rejected": -0.24763794243335724, "logps/chosen": -383.1000061035156, "logps/rejected": -403.5, "loss": 0.0868, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0372314453125, "rewards/margins": 5.470312595367432, "rewards/rejected": -5.4296875, "step": 6020 }, { "epoch": 1.5893516078017922, "grad_norm": 73.68810285512379, "learning_rate": 6.027279915656299e-07, "logits/chosen": 0.0853271484375, "logits/rejected": -0.1029052734375, "logps/chosen": -335.29998779296875, "logps/rejected": -377.1000061035156, "loss": 0.0867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.03745117038488388, "rewards/margins": 5.05859375, "rewards/rejected": -5.094531059265137, "step": 6030 }, { "epoch": 1.591987348444913, "grad_norm": 44.43935653385135, "learning_rate": 6.020690564048498e-07, "logits/chosen": 0.15538330376148224, "logits/rejected": -0.1138916015625, "logps/chosen": -399.29998779296875, "logps/rejected": -409.04998779296875, "loss": 0.0623, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.007275390438735485, "rewards/margins": 5.434374809265137, "rewards/rejected": -5.428906440734863, "step": 6040 }, { "epoch": 1.5946230890880337, "grad_norm": 30.000917755435758, "learning_rate": 6.014101212440695e-07, "logits/chosen": 0.0041259764693677425, "logits/rejected": -0.06866455078125, "logps/chosen": -384.70001220703125, "logps/rejected": -402.45001220703125, "loss": 0.0756, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03395996242761612, "rewards/margins": 5.482812404632568, "rewards/rejected": -5.522656440734863, "step": 6050 }, { "epoch": 1.5972588297311545, "grad_norm": 44.921088192874194, "learning_rate": 6.007511860832894e-07, "logits/chosen": 0.16171875596046448, "logits/rejected": 0.02955932542681694, "logps/chosen": -381.5, "logps/rejected": -334.25, "loss": 0.1254, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.010882568545639515, "rewards/margins": 4.848437309265137, "rewards/rejected": -4.86328125, "step": 6060 }, { "epoch": 1.5998945703742753, "grad_norm": 94.31684911137074, "learning_rate": 6.000922509225091e-07, "logits/chosen": 0.07965698093175888, "logits/rejected": -0.07890625298023224, "logps/chosen": -366.54998779296875, "logps/rejected": -389.54998779296875, "loss": 0.111, "rewards/accuracies": 0.96875, "rewards/chosen": 0.08896484225988388, "rewards/margins": 5.239062309265137, "rewards/rejected": -5.149218559265137, "step": 6070 }, { "epoch": 1.6025303110173958, "grad_norm": 14.610011998410085, "learning_rate": 5.994333157617291e-07, "logits/chosen": 0.21434326469898224, "logits/rejected": 0.09709472954273224, "logps/chosen": -341.8500061035156, "logps/rejected": -380.1000061035156, "loss": 0.1339, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.26463621854782104, "rewards/margins": 5.053124904632568, "rewards/rejected": -4.7890625, "step": 6080 }, { "epoch": 1.6051660516605166, "grad_norm": 15.132048695385405, "learning_rate": 5.987743806009488e-07, "logits/chosen": 0.24066925048828125, "logits/rejected": -0.05586547777056694, "logps/chosen": -400.70001220703125, "logps/rejected": -422.3999938964844, "loss": 0.1043, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22989502549171448, "rewards/margins": 5.334374904632568, "rewards/rejected": -5.106249809265137, "step": 6090 }, { "epoch": 1.6078017923036372, "grad_norm": 26.84967661111668, "learning_rate": 5.981154454401687e-07, "logits/chosen": 0.2392425537109375, "logits/rejected": 0.10313110053539276, "logps/chosen": -408.3999938964844, "logps/rejected": -402.6000061035156, "loss": 0.1048, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4610351622104645, "rewards/margins": 5.016406059265137, "rewards/rejected": -4.55078125, "step": 6100 }, { "epoch": 1.610437532946758, "grad_norm": 20.495570269666594, "learning_rate": 5.974565102793885e-07, "logits/chosen": 0.21099853515625, "logits/rejected": 0.02430419996380806, "logps/chosen": -352.75, "logps/rejected": -379.20001220703125, "loss": 0.0658, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.40898436307907104, "rewards/margins": 5.11328125, "rewards/rejected": -4.706250190734863, "step": 6110 }, { "epoch": 1.6130732735898787, "grad_norm": 15.4991211712623, "learning_rate": 5.967975751186083e-07, "logits/chosen": 0.10148926079273224, "logits/rejected": -0.00897216796875, "logps/chosen": -367.1499938964844, "logps/rejected": -389.54998779296875, "loss": 0.0742, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.41212159395217896, "rewards/margins": 5.653906345367432, "rewards/rejected": -5.240624904632568, "step": 6120 }, { "epoch": 1.6157090142329995, "grad_norm": 24.254876402605355, "learning_rate": 5.961386399578281e-07, "logits/chosen": -0.04119262844324112, "logits/rejected": -0.09423370659351349, "logps/chosen": -357.6000061035156, "logps/rejected": -392.54998779296875, "loss": 0.1024, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.13505859673023224, "rewards/margins": 5.354687690734863, "rewards/rejected": -5.21875, "step": 6130 }, { "epoch": 1.6183447548761203, "grad_norm": 74.56169996073008, "learning_rate": 5.954797047970479e-07, "logits/chosen": 0.05362548679113388, "logits/rejected": -0.08824463188648224, "logps/chosen": -326.6499938964844, "logps/rejected": -383.20001220703125, "loss": 0.1212, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.21245117485523224, "rewards/margins": 5.063281059265137, "rewards/rejected": -4.849218845367432, "step": 6140 }, { "epoch": 1.620980495519241, "grad_norm": 25.860989486999255, "learning_rate": 5.948207696362678e-07, "logits/chosen": 0.05231933668255806, "logits/rejected": -0.03992919996380806, "logps/chosen": -323.45001220703125, "logps/rejected": -381.8999938964844, "loss": 0.1283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.32020872831344604, "rewards/margins": 5.090624809265137, "rewards/rejected": -4.771093845367432, "step": 6150 }, { "epoch": 1.6236162361623616, "grad_norm": 33.13551728696184, "learning_rate": 5.941618344754877e-07, "logits/chosen": 0.20269469916820526, "logits/rejected": 0.13744506239891052, "logps/chosen": -390.7250061035156, "logps/rejected": -404.20001220703125, "loss": 0.0934, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8448730707168579, "rewards/margins": 5.366406440734863, "rewards/rejected": -4.517968654632568, "step": 6160 }, { "epoch": 1.6262519768054824, "grad_norm": 32.70424551909577, "learning_rate": 5.935028993147074e-07, "logits/chosen": 0.00948486290872097, "logits/rejected": -0.02633056603372097, "logps/chosen": -330.3999938964844, "logps/rejected": -367.3999938964844, "loss": 0.1458, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.6113647222518921, "rewards/margins": 4.78515625, "rewards/rejected": -4.175000190734863, "step": 6170 }, { "epoch": 1.628887717448603, "grad_norm": 29.204363518863914, "learning_rate": 5.928439641539273e-07, "logits/chosen": 0.21817627549171448, "logits/rejected": -0.03790893405675888, "logps/chosen": -367.20001220703125, "logps/rejected": -378.75, "loss": 0.1204, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5346008539199829, "rewards/margins": 4.9296875, "rewards/rejected": -4.392968654632568, "step": 6180 }, { "epoch": 1.6315234580917237, "grad_norm": 20.738660622138475, "learning_rate": 5.92185028993147e-07, "logits/chosen": 0.10102538764476776, "logits/rejected": -0.0057319640181958675, "logps/chosen": -376.70001220703125, "logps/rejected": -389.3999938964844, "loss": 0.1247, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.006518554873764515, "rewards/margins": 4.889062404632568, "rewards/rejected": -4.903124809265137, "step": 6190 }, { "epoch": 1.6341591987348445, "grad_norm": 21.966372014547286, "learning_rate": 5.915260938323669e-07, "logits/chosen": 0.15300293266773224, "logits/rejected": -0.12579345703125, "logps/chosen": -346.25, "logps/rejected": -381.1499938964844, "loss": 0.0805, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08808593451976776, "rewards/margins": 5.299218654632568, "rewards/rejected": -5.387499809265137, "step": 6200 }, { "epoch": 1.6367949393779653, "grad_norm": 45.12079800990217, "learning_rate": 5.908671586715866e-07, "logits/chosen": 0.07229004055261612, "logits/rejected": -0.0186614990234375, "logps/chosen": -325.42498779296875, "logps/rejected": -377.1000061035156, "loss": 0.1031, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10861816257238388, "rewards/margins": 5.305468559265137, "rewards/rejected": -5.196875095367432, "step": 6210 }, { "epoch": 1.639430680021086, "grad_norm": 18.97500780109545, "learning_rate": 5.902082235108064e-07, "logits/chosen": 0.16729736328125, "logits/rejected": -0.03989257663488388, "logps/chosen": -396.3500061035156, "logps/rejected": -401.6000061035156, "loss": 0.0903, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.09702148288488388, "rewards/margins": 5.137499809265137, "rewards/rejected": -5.035937309265137, "step": 6220 }, { "epoch": 1.6420664206642066, "grad_norm": 73.96383895863494, "learning_rate": 5.895492883500264e-07, "logits/chosen": 0.16242675483226776, "logits/rejected": -0.004656982608139515, "logps/chosen": -391.8500061035156, "logps/rejected": -428.29998779296875, "loss": 0.0886, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.03959960862994194, "rewards/margins": 5.641406059265137, "rewards/rejected": -5.682031154632568, "step": 6230 }, { "epoch": 1.6447021613073274, "grad_norm": 35.03606271362049, "learning_rate": 5.888903531892462e-07, "logits/chosen": 0.08155517280101776, "logits/rejected": -0.14664307236671448, "logps/chosen": -359.0, "logps/rejected": -383.20001220703125, "loss": 0.1001, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.03085937537252903, "rewards/margins": 5.62890625, "rewards/rejected": -5.65625, "step": 6240 }, { "epoch": 1.647337901950448, "grad_norm": 41.31257418782067, "learning_rate": 5.88231418028466e-07, "logits/chosen": 0.02144775353372097, "logits/rejected": -0.07607726752758026, "logps/chosen": -346.8500061035156, "logps/rejected": -362.04998779296875, "loss": 0.1381, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.23245850205421448, "rewards/margins": 4.703906059265137, "rewards/rejected": -4.931250095367432, "step": 6250 }, { "epoch": 1.6499736425935687, "grad_norm": 39.6326068332139, "learning_rate": 5.875724828676857e-07, "logits/chosen": 0.10953979194164276, "logits/rejected": -0.0503082275390625, "logps/chosen": -388.79998779296875, "logps/rejected": -405.3999938964844, "loss": 0.0889, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24114990234375, "rewards/margins": 4.996874809265137, "rewards/rejected": -5.234375, "step": 6260 }, { "epoch": 1.6526093832366895, "grad_norm": 12.4018623447746, "learning_rate": 5.869135477069056e-07, "logits/chosen": -0.013824462890625, "logits/rejected": -0.14398193359375, "logps/chosen": -368.8500061035156, "logps/rejected": -392.3999938964844, "loss": 0.0804, "rewards/accuracies": 0.96875, "rewards/chosen": 0.16634520888328552, "rewards/margins": 5.578906059265137, "rewards/rejected": -5.412499904632568, "step": 6270 }, { "epoch": 1.6552451238798103, "grad_norm": 55.95228321622098, "learning_rate": 5.862546125461254e-07, "logits/chosen": 0.08083190768957138, "logits/rejected": -0.01847534254193306, "logps/chosen": -376.20001220703125, "logps/rejected": -383.95001220703125, "loss": 0.1169, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2490234375, "rewards/margins": 4.831250190734863, "rewards/rejected": -4.582812309265137, "step": 6280 }, { "epoch": 1.657880864522931, "grad_norm": 47.59316712425632, "learning_rate": 5.855956773853452e-07, "logits/chosen": 0.146453857421875, "logits/rejected": 0.03303222730755806, "logps/chosen": -388.04998779296875, "logps/rejected": -387.8999938964844, "loss": 0.0751, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.553466796875, "rewards/margins": 5.121874809265137, "rewards/rejected": -4.563281059265137, "step": 6290 }, { "epoch": 1.6605166051660518, "grad_norm": 26.931718974467014, "learning_rate": 5.849367422245651e-07, "logits/chosen": 0.27104490995407104, "logits/rejected": 0.05150756984949112, "logps/chosen": -350.82501220703125, "logps/rejected": -385.95001220703125, "loss": 0.0913, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8334716558456421, "rewards/margins": 5.354687690734863, "rewards/rejected": -4.515625, "step": 6300 }, { "epoch": 1.6631523458091724, "grad_norm": 32.68403239064545, "learning_rate": 5.842778070637849e-07, "logits/chosen": 0.33208006620407104, "logits/rejected": 0.20991821587085724, "logps/chosen": -393.6000061035156, "logps/rejected": -402.1499938964844, "loss": 0.0939, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.635327160358429, "rewards/margins": 5.181250095367432, "rewards/rejected": -4.550000190734863, "step": 6310 }, { "epoch": 1.6657880864522931, "grad_norm": 61.980477341918835, "learning_rate": 5.836188719030047e-07, "logits/chosen": 0.24866943061351776, "logits/rejected": 0.02709350548684597, "logps/chosen": -387.3999938964844, "logps/rejected": -386.3999938964844, "loss": 0.0822, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25609129667282104, "rewards/margins": 4.873437404632568, "rewards/rejected": -4.612500190734863, "step": 6320 }, { "epoch": 1.6684238270954137, "grad_norm": 15.766327199009085, "learning_rate": 5.829599367422246e-07, "logits/chosen": 0.10502929985523224, "logits/rejected": 0.02001953125, "logps/chosen": -362.6499938964844, "logps/rejected": -418.8500061035156, "loss": 0.1259, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.2979492247104645, "rewards/margins": 5.224999904632568, "rewards/rejected": -4.926562309265137, "step": 6330 }, { "epoch": 1.6710595677385345, "grad_norm": 40.01351774875091, "learning_rate": 5.823010015814443e-07, "logits/chosen": 0.099578857421875, "logits/rejected": 0.10645751655101776, "logps/chosen": -361.79998779296875, "logps/rejected": -388.20001220703125, "loss": 0.0855, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.04907836765050888, "rewards/margins": 5.256249904632568, "rewards/rejected": -5.299218654632568, "step": 6340 }, { "epoch": 1.6736953083816553, "grad_norm": 49.27040563407566, "learning_rate": 5.816420664206642e-07, "logits/chosen": 0.11818847805261612, "logits/rejected": -0.11571349948644638, "logps/chosen": -370.54998779296875, "logps/rejected": -408.1499938964844, "loss": 0.0973, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2648071348667145, "rewards/margins": 5.360937595367432, "rewards/rejected": -5.095312595367432, "step": 6350 }, { "epoch": 1.676331049024776, "grad_norm": 27.596371058808227, "learning_rate": 5.809831312598839e-07, "logits/chosen": 0.29206544160842896, "logits/rejected": -0.06273193657398224, "logps/chosen": -351.95001220703125, "logps/rejected": -345.29998779296875, "loss": 0.1009, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.13643798232078552, "rewards/margins": 5.103125095367432, "rewards/rejected": -4.96484375, "step": 6360 }, { "epoch": 1.6789667896678968, "grad_norm": 35.114943658782046, "learning_rate": 5.803241960991038e-07, "logits/chosen": 0.13833007216453552, "logits/rejected": 0.02576904371380806, "logps/chosen": -354.75, "logps/rejected": -411.04998779296875, "loss": 0.0807, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.16071777045726776, "rewards/margins": 5.323437690734863, "rewards/rejected": -5.163281440734863, "step": 6370 }, { "epoch": 1.6816025303110174, "grad_norm": 48.220409575014465, "learning_rate": 5.796652609383237e-07, "logits/chosen": 0.04996337741613388, "logits/rejected": -0.12327422946691513, "logps/chosen": -337.3500061035156, "logps/rejected": -384.29998779296875, "loss": 0.1082, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10272216796875, "rewards/margins": 4.98046875, "rewards/rejected": -5.078906059265137, "step": 6380 }, { "epoch": 1.6842382709541381, "grad_norm": 61.36460085868427, "learning_rate": 5.790063257775435e-07, "logits/chosen": 0.151947021484375, "logits/rejected": 0.02393188513815403, "logps/chosen": -397.8999938964844, "logps/rejected": -375.70001220703125, "loss": 0.0939, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.4963622987270355, "rewards/margins": 5.025781154632568, "rewards/rejected": -5.522656440734863, "step": 6390 }, { "epoch": 1.6868740115972587, "grad_norm": 251.659500312642, "learning_rate": 5.783473906167633e-07, "logits/chosen": 0.02041015587747097, "logits/rejected": -0.04822998121380806, "logps/chosen": -390.1499938964844, "logps/rejected": -399.45001220703125, "loss": 0.106, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.41046142578125, "rewards/margins": 5.610937595367432, "rewards/rejected": -6.025000095367432, "step": 6400 }, { "epoch": 1.6895097522403795, "grad_norm": 155.0840972551248, "learning_rate": 5.776884554559831e-07, "logits/chosen": 0.16371765732765198, "logits/rejected": 0.0430755615234375, "logps/chosen": -389.29998779296875, "logps/rejected": -403.54998779296875, "loss": 0.0949, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07391357421875, "rewards/margins": 5.29296875, "rewards/rejected": -5.21484375, "step": 6410 }, { "epoch": 1.6921454928835002, "grad_norm": 37.40933865956503, "learning_rate": 5.770295202952029e-07, "logits/chosen": 0.13482055068016052, "logits/rejected": 0.002166748046875, "logps/chosen": -342.1000061035156, "logps/rejected": -411.29998779296875, "loss": 0.0844, "rewards/accuracies": 0.96875, "rewards/chosen": -0.02031249925494194, "rewards/margins": 5.749218940734863, "rewards/rejected": -5.766406059265137, "step": 6420 }, { "epoch": 1.694781233526621, "grad_norm": 81.04989689091896, "learning_rate": 5.763705851344227e-07, "logits/chosen": 0.09526367485523224, "logits/rejected": 0.05974731594324112, "logps/chosen": -378.95001220703125, "logps/rejected": -396.54998779296875, "loss": 0.1267, "rewards/accuracies": 0.96875, "rewards/chosen": -0.16942748427391052, "rewards/margins": 4.814843654632568, "rewards/rejected": -4.979687690734863, "step": 6430 }, { "epoch": 1.6974169741697418, "grad_norm": 30.489113004308056, "learning_rate": 5.757116499736425e-07, "logits/chosen": 0.114776611328125, "logits/rejected": -0.06633301079273224, "logps/chosen": -354.7250061035156, "logps/rejected": -389.5, "loss": 0.0957, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.38972169160842896, "rewards/margins": 5.71484375, "rewards/rejected": -5.331250190734863, "step": 6440 }, { "epoch": 1.7000527148128626, "grad_norm": 28.812970969389138, "learning_rate": 5.750527148128625e-07, "logits/chosen": 0.19647522270679474, "logits/rejected": 0.01485595665872097, "logps/chosen": -394.3999938964844, "logps/rejected": -376.1000061035156, "loss": 0.1299, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.24074706435203552, "rewards/margins": 5.322656154632568, "rewards/rejected": -5.078125, "step": 6450 }, { "epoch": 1.7026884554559831, "grad_norm": 36.53821190639969, "learning_rate": 5.743937796520822e-07, "logits/chosen": 0.02763671800494194, "logits/rejected": -0.12427368015050888, "logps/chosen": -412.75, "logps/rejected": -414.25, "loss": 0.0872, "rewards/accuracies": 0.96875, "rewards/chosen": 0.01602783240377903, "rewards/margins": 5.188281059265137, "rewards/rejected": -5.171093940734863, "step": 6460 }, { "epoch": 1.7053241960991037, "grad_norm": 39.85217707135062, "learning_rate": 5.737348444913021e-07, "logits/chosen": 0.005810546688735485, "logits/rejected": -0.06950683891773224, "logps/chosen": -336.25, "logps/rejected": -367.8500061035156, "loss": 0.1121, "rewards/accuracies": 0.96875, "rewards/chosen": -0.14108887314796448, "rewards/margins": 4.8671875, "rewards/rejected": -5.009375095367432, "step": 6470 }, { "epoch": 1.7079599367422245, "grad_norm": 43.577054668513604, "learning_rate": 5.730759093305218e-07, "logits/chosen": 0.14401856064796448, "logits/rejected": 0.04774169996380806, "logps/chosen": -388.5, "logps/rejected": -416.1000061035156, "loss": 0.1237, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.06503906100988388, "rewards/margins": 5.360937595367432, "rewards/rejected": -5.29296875, "step": 6480 }, { "epoch": 1.7105956773853452, "grad_norm": 8.549776097068, "learning_rate": 5.724169741697417e-07, "logits/chosen": 0.14169922471046448, "logits/rejected": 0.0074554444290697575, "logps/chosen": -361.0, "logps/rejected": -367.8500061035156, "loss": 0.055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09816894680261612, "rewards/margins": 5.405468940734863, "rewards/rejected": -5.509375095367432, "step": 6490 }, { "epoch": 1.713231418028466, "grad_norm": 50.08997194819138, "learning_rate": 5.717580390089615e-07, "logits/chosen": 0.13241882622241974, "logits/rejected": 0.054901123046875, "logps/chosen": -357.1000061035156, "logps/rejected": -398.29998779296875, "loss": 0.1107, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.03440551832318306, "rewards/margins": 5.698437690734863, "rewards/rejected": -5.739062309265137, "step": 6500 }, { "epoch": 1.7158671586715868, "grad_norm": 48.631749639819056, "learning_rate": 5.710991038481813e-07, "logits/chosen": 0.18253174424171448, "logits/rejected": -0.04398803785443306, "logps/chosen": -397.8999938964844, "logps/rejected": -408.79998779296875, "loss": 0.0799, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04635009914636612, "rewards/margins": 5.699999809265137, "rewards/rejected": -5.6484375, "step": 6510 }, { "epoch": 1.7185028993147076, "grad_norm": 85.405181012338, "learning_rate": 5.704401686874011e-07, "logits/chosen": 0.038330078125, "logits/rejected": -0.02720947191119194, "logps/chosen": -397.79998779296875, "logps/rejected": -393.45001220703125, "loss": 0.1261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2783203125, "rewards/margins": 5.263281345367432, "rewards/rejected": -5.543749809265137, "step": 6520 }, { "epoch": 1.7211386399578281, "grad_norm": 23.13423250262512, "learning_rate": 5.69781233526621e-07, "logits/chosen": 0.15799561142921448, "logits/rejected": -0.02536621131002903, "logps/chosen": -319.5, "logps/rejected": -363.29998779296875, "loss": 0.0757, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.134765625, "rewards/margins": 5.022656440734863, "rewards/rejected": -5.1640625, "step": 6530 }, { "epoch": 1.723774380600949, "grad_norm": 36.2268797726589, "learning_rate": 5.691222983658408e-07, "logits/chosen": 0.14946289360523224, "logits/rejected": 0.08229980617761612, "logps/chosen": -370.6499938964844, "logps/rejected": -401.8999938964844, "loss": 0.0681, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05351562425494194, "rewards/margins": 5.517968654632568, "rewards/rejected": -5.46484375, "step": 6540 }, { "epoch": 1.7264101212440695, "grad_norm": 40.944962241714656, "learning_rate": 5.684633632050607e-07, "logits/chosen": 0.1204071044921875, "logits/rejected": -0.04855956882238388, "logps/chosen": -379.20001220703125, "logps/rejected": -396.5, "loss": 0.0892, "rewards/accuracies": 0.96875, "rewards/chosen": 0.08798827975988388, "rewards/margins": 5.154687404632568, "rewards/rejected": -5.06640625, "step": 6550 }, { "epoch": 1.7290458618871902, "grad_norm": 50.884467096469024, "learning_rate": 5.678044280442804e-07, "logits/chosen": 0.2541565001010895, "logits/rejected": -0.022064208984375, "logps/chosen": -432.75, "logps/rejected": -426.79998779296875, "loss": 0.0585, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.48704832792282104, "rewards/margins": 5.927343845367432, "rewards/rejected": -5.451562404632568, "step": 6560 }, { "epoch": 1.731681602530311, "grad_norm": 51.60960715491739, "learning_rate": 5.671454928835003e-07, "logits/chosen": 0.03933105617761612, "logits/rejected": -0.031982421875, "logps/chosen": -362.8500061035156, "logps/rejected": -364.04998779296875, "loss": 0.0711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.28095704317092896, "rewards/margins": 5.076562404632568, "rewards/rejected": -5.357031345367432, "step": 6570 }, { "epoch": 1.7343173431734318, "grad_norm": 86.53538022915647, "learning_rate": 5.6648655772272e-07, "logits/chosen": 0.14658203721046448, "logits/rejected": -0.11690368503332138, "logps/chosen": -382.79998779296875, "logps/rejected": -402.45001220703125, "loss": 0.1539, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.00783691368997097, "rewards/margins": 5.536328315734863, "rewards/rejected": -5.536718845367432, "step": 6580 }, { "epoch": 1.7369530838165526, "grad_norm": 47.87854462188389, "learning_rate": 5.658276225619398e-07, "logits/chosen": 0.15039673447608948, "logits/rejected": 0.07003173977136612, "logps/chosen": -340.45001220703125, "logps/rejected": -389.5, "loss": 0.1274, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12359313666820526, "rewards/margins": 4.858593940734863, "rewards/rejected": -4.734375, "step": 6590 }, { "epoch": 1.7395888244596733, "grad_norm": 33.56250089013881, "learning_rate": 5.651686874011597e-07, "logits/chosen": 0.22943115234375, "logits/rejected": -0.09084472805261612, "logps/chosen": -430.5, "logps/rejected": -385.3999938964844, "loss": 0.0665, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5505615472793579, "rewards/margins": 5.766406059265137, "rewards/rejected": -5.2109375, "step": 6600 }, { "epoch": 1.7422245651027939, "grad_norm": 10.181705202956582, "learning_rate": 5.645097522403796e-07, "logits/chosen": 0.27705079317092896, "logits/rejected": 0.193359375, "logps/chosen": -367.79998779296875, "logps/rejected": -390.5, "loss": 0.1286, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5277099609375, "rewards/margins": 5.104687690734863, "rewards/rejected": -4.578125, "step": 6610 }, { "epoch": 1.7448603057459144, "grad_norm": 37.11899375260548, "learning_rate": 5.638508170795994e-07, "logits/chosen": 0.23803099989891052, "logits/rejected": 0.07681884616613388, "logps/chosen": -368.95001220703125, "logps/rejected": -399.3999938964844, "loss": 0.1123, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.27019041776657104, "rewards/margins": 5.417187690734863, "rewards/rejected": -5.14453125, "step": 6620 }, { "epoch": 1.7474960463890352, "grad_norm": 26.541923778733604, "learning_rate": 5.631918819188191e-07, "logits/chosen": 0.12042236328125, "logits/rejected": 0.08328857272863388, "logps/chosen": -369.45001220703125, "logps/rejected": -406.20001220703125, "loss": 0.0898, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.706591784954071, "rewards/margins": 5.71484375, "rewards/rejected": -5.005468845367432, "step": 6630 }, { "epoch": 1.750131787032156, "grad_norm": 74.7899151528272, "learning_rate": 5.62532946758039e-07, "logits/chosen": 0.2997116148471832, "logits/rejected": 0.10543213039636612, "logps/chosen": -401.54998779296875, "logps/rejected": -389.95001220703125, "loss": 0.1396, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.22484131157398224, "rewards/margins": 4.940625190734863, "rewards/rejected": -4.713281154632568, "step": 6640 }, { "epoch": 1.7527675276752768, "grad_norm": 29.507737446260087, "learning_rate": 5.618740115972587e-07, "logits/chosen": 0.15867920219898224, "logits/rejected": 0.02221069298684597, "logps/chosen": -384.70001220703125, "logps/rejected": -393.1000061035156, "loss": 0.0781, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.06855468451976776, "rewards/margins": 5.414843559265137, "rewards/rejected": -5.484375, "step": 6650 }, { "epoch": 1.7554032683183975, "grad_norm": 36.651331871830855, "learning_rate": 5.612150764364786e-07, "logits/chosen": 0.03300781175494194, "logits/rejected": -0.03798675537109375, "logps/chosen": -321.20001220703125, "logps/rejected": -348.54998779296875, "loss": 0.1009, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.436026006937027, "rewards/margins": 5.217968940734863, "rewards/rejected": -5.657031059265137, "step": 6660 }, { "epoch": 1.7580390089615183, "grad_norm": 12.15443254315024, "learning_rate": 5.605561412756984e-07, "logits/chosen": -0.06990966945886612, "logits/rejected": -0.10540771484375, "logps/chosen": -318.5, "logps/rejected": -343.5, "loss": 0.1347, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45878905057907104, "rewards/margins": 4.946093559265137, "rewards/rejected": -5.407812595367432, "step": 6670 }, { "epoch": 1.7606747496046389, "grad_norm": 75.72670147001037, "learning_rate": 5.598972061149183e-07, "logits/chosen": 0.06893310695886612, "logits/rejected": 0.02826538123190403, "logps/chosen": -327.04998779296875, "logps/rejected": -378.0, "loss": 0.1123, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.634814441204071, "rewards/margins": 5.08203125, "rewards/rejected": -5.715624809265137, "step": 6680 }, { "epoch": 1.7633104902477597, "grad_norm": 12.003702227757325, "learning_rate": 5.592382709541381e-07, "logits/chosen": 0.03791503980755806, "logits/rejected": -0.10894775390625, "logps/chosen": -391.20001220703125, "logps/rejected": -393.1000061035156, "loss": 0.1049, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0036132812965661287, "rewards/margins": 5.616406440734863, "rewards/rejected": -5.618750095367432, "step": 6690 }, { "epoch": 1.7659462308908802, "grad_norm": 11.967257240385967, "learning_rate": 5.585793357933579e-07, "logits/chosen": 0.15295104682445526, "logits/rejected": -0.0284423828125, "logps/chosen": -376.0, "logps/rejected": -385.8999938964844, "loss": 0.0883, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.35328370332717896, "rewards/margins": 4.865624904632568, "rewards/rejected": -5.219531059265137, "step": 6700 }, { "epoch": 1.768581971534001, "grad_norm": 23.798090787847087, "learning_rate": 5.579204006325777e-07, "logits/chosen": 0.30564117431640625, "logits/rejected": -0.03124389611184597, "logps/chosen": -356.04998779296875, "logps/rejected": -339.3500061035156, "loss": 0.1066, "rewards/accuracies": 0.96875, "rewards/chosen": -0.45256346464157104, "rewards/margins": 4.703125, "rewards/rejected": -5.15625, "step": 6710 }, { "epoch": 1.7712177121771218, "grad_norm": 41.93439370892545, "learning_rate": 5.572614654717975e-07, "logits/chosen": 0.09632568061351776, "logits/rejected": -0.07755737006664276, "logps/chosen": -369.25, "logps/rejected": -385.20001220703125, "loss": 0.0888, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01870117150247097, "rewards/margins": 5.121874809265137, "rewards/rejected": -5.104687690734863, "step": 6720 }, { "epoch": 1.7738534528202425, "grad_norm": 8.073386833315709, "learning_rate": 5.566025303110173e-07, "logits/chosen": 0.08889160305261612, "logits/rejected": -0.09122924506664276, "logps/chosen": -435.0, "logps/rejected": -466.95001220703125, "loss": 0.0697, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11430664360523224, "rewards/margins": 5.587500095367432, "rewards/rejected": -5.471093654632568, "step": 6730 }, { "epoch": 1.7764891934633633, "grad_norm": 94.67599973098261, "learning_rate": 5.559435951502372e-07, "logits/chosen": -0.0557861328125, "logits/rejected": -0.05472106859087944, "logps/chosen": -336.70001220703125, "logps/rejected": -374.3500061035156, "loss": 0.0895, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.08682861179113388, "rewards/margins": 5.013281345367432, "rewards/rejected": -5.103125095367432, "step": 6740 }, { "epoch": 1.7791249341064839, "grad_norm": 40.74945565760971, "learning_rate": 5.55284659989457e-07, "logits/chosen": 0.1391914337873459, "logits/rejected": 0.05955810472369194, "logps/chosen": -366.70001220703125, "logps/rejected": -380.6499938964844, "loss": 0.0981, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09375, "rewards/margins": 5.201562404632568, "rewards/rejected": -5.293749809265137, "step": 6750 }, { "epoch": 1.7817606747496046, "grad_norm": 33.960476859925464, "learning_rate": 5.546257248286769e-07, "logits/chosen": 0.2479248046875, "logits/rejected": -0.06072998046875, "logps/chosen": -346.3500061035156, "logps/rejected": -373.75, "loss": 0.0997, "rewards/accuracies": 0.96875, "rewards/chosen": -0.30213624238967896, "rewards/margins": 4.7578125, "rewards/rejected": -5.059374809265137, "step": 6760 }, { "epoch": 1.7843964153927252, "grad_norm": 70.33006615704386, "learning_rate": 5.539667896678966e-07, "logits/chosen": 0.16053466498851776, "logits/rejected": 0.03416747972369194, "logps/chosen": -390.75, "logps/rejected": -384.20001220703125, "loss": 0.0944, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.37971192598342896, "rewards/margins": 4.896093845367432, "rewards/rejected": -5.276562690734863, "step": 6770 }, { "epoch": 1.787032156035846, "grad_norm": 31.451353105069973, "learning_rate": 5.533078545071165e-07, "logits/chosen": 0.07164306938648224, "logits/rejected": -0.09141235053539276, "logps/chosen": -378.54998779296875, "logps/rejected": -381.5, "loss": 0.1202, "rewards/accuracies": 0.96875, "rewards/chosen": 0.19985350966453552, "rewards/margins": 5.279687404632568, "rewards/rejected": -5.08203125, "step": 6780 }, { "epoch": 1.7896678966789668, "grad_norm": 44.97045950967553, "learning_rate": 5.526489193463363e-07, "logits/chosen": 0.06493835151195526, "logits/rejected": 0.03484649583697319, "logps/chosen": -382.3999938964844, "logps/rejected": -440.79998779296875, "loss": 0.0924, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04543457180261612, "rewards/margins": 5.322656154632568, "rewards/rejected": -5.271093845367432, "step": 6790 }, { "epoch": 1.7923036373220875, "grad_norm": 30.4790777030195, "learning_rate": 5.519899841855561e-07, "logits/chosen": -0.01068115234375, "logits/rejected": -0.14364013075828552, "logps/chosen": -353.29998779296875, "logps/rejected": -407.8999938964844, "loss": 0.0896, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.159423828125, "rewards/margins": 5.374218940734863, "rewards/rejected": -5.533593654632568, "step": 6800 }, { "epoch": 1.7949393779652083, "grad_norm": 75.44495157097266, "learning_rate": 5.513310490247759e-07, "logits/chosen": 0.07901611179113388, "logits/rejected": -0.14844055473804474, "logps/chosen": -322.8500061035156, "logps/rejected": -341.8500061035156, "loss": 0.1486, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7201293706893921, "rewards/margins": 4.505468845367432, "rewards/rejected": -5.23046875, "step": 6810 }, { "epoch": 1.797575118608329, "grad_norm": 28.460736486300537, "learning_rate": 5.506721138639958e-07, "logits/chosen": 0.19206543266773224, "logits/rejected": -0.05023193359375, "logps/chosen": -389.3999938964844, "logps/rejected": -380.04998779296875, "loss": 0.0904, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.25933837890625, "rewards/margins": 5.193749904632568, "rewards/rejected": -5.454687595367432, "step": 6820 }, { "epoch": 1.8002108592514496, "grad_norm": 47.5437263412305, "learning_rate": 5.500131787032156e-07, "logits/chosen": -0.06019287183880806, "logits/rejected": -0.14307861030101776, "logps/chosen": -381.8500061035156, "logps/rejected": -411.3999938964844, "loss": 0.1205, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.05161132663488388, "rewards/margins": 5.381249904632568, "rewards/rejected": -5.428906440734863, "step": 6830 }, { "epoch": 1.8028465998945704, "grad_norm": 21.53707213303082, "learning_rate": 5.493542435424355e-07, "logits/chosen": 0.05228271335363388, "logits/rejected": -0.0069946288131177425, "logps/chosen": -388.0, "logps/rejected": -391.20001220703125, "loss": 0.084, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05451660230755806, "rewards/margins": 5.169531345367432, "rewards/rejected": -5.227343559265137, "step": 6840 }, { "epoch": 1.805482340537691, "grad_norm": 53.04627267838241, "learning_rate": 5.486953083816552e-07, "logits/chosen": 0.19659423828125, "logits/rejected": 0.20417480170726776, "logps/chosen": -354.5, "logps/rejected": -387.75, "loss": 0.0982, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24494628608226776, "rewards/margins": 5.042187690734863, "rewards/rejected": -4.796093940734863, "step": 6850 }, { "epoch": 1.8081180811808117, "grad_norm": 49.940703154438566, "learning_rate": 5.480363732208751e-07, "logits/chosen": 0.19379273056983948, "logits/rejected": 0.05685615539550781, "logps/chosen": -392.8999938964844, "logps/rejected": -423.5, "loss": 0.083, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.50537109375, "rewards/margins": 5.514843940734863, "rewards/rejected": -5.008593559265137, "step": 6860 }, { "epoch": 1.8107538218239325, "grad_norm": 24.223593099305095, "learning_rate": 5.473774380600948e-07, "logits/chosen": 0.03692321851849556, "logits/rejected": 0.06120605394244194, "logps/chosen": -346.6499938964844, "logps/rejected": -409.0, "loss": 0.1223, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.13918456435203552, "rewards/margins": 4.849218845367432, "rewards/rejected": -4.708593845367432, "step": 6870 }, { "epoch": 1.8133895624670533, "grad_norm": 8.680869908242471, "learning_rate": 5.467185028993147e-07, "logits/chosen": 0.11723633110523224, "logits/rejected": -0.02109985426068306, "logps/chosen": -380.45001220703125, "logps/rejected": -422.20001220703125, "loss": 0.1055, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16889648139476776, "rewards/margins": 5.391406059265137, "rewards/rejected": -5.224999904632568, "step": 6880 }, { "epoch": 1.816025303110174, "grad_norm": 16.667798806827367, "learning_rate": 5.460595677385344e-07, "logits/chosen": 0.1302490234375, "logits/rejected": -0.06365966796875, "logps/chosen": -369.54998779296875, "logps/rejected": -399.1000061035156, "loss": 0.0753, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01701660081744194, "rewards/margins": 5.221875190734863, "rewards/rejected": -5.240624904632568, "step": 6890 }, { "epoch": 1.8186610437532946, "grad_norm": 13.899175269188522, "learning_rate": 5.454006325777544e-07, "logits/chosen": 0.15651245415210724, "logits/rejected": -0.07255401462316513, "logps/chosen": -382.04998779296875, "logps/rejected": -399.6499938964844, "loss": 0.1376, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04658203199505806, "rewards/margins": 5.064062595367432, "rewards/rejected": -5.020312309265137, "step": 6900 }, { "epoch": 1.8212967843964154, "grad_norm": 21.566012981721055, "learning_rate": 5.447416974169742e-07, "logits/chosen": 0.2139846831560135, "logits/rejected": 0.06038818508386612, "logps/chosen": -357.5, "logps/rejected": -417.79998779296875, "loss": 0.0943, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0018554687267169356, "rewards/margins": 5.0234375, "rewards/rejected": -5.01953125, "step": 6910 }, { "epoch": 1.823932525039536, "grad_norm": 43.91293004868379, "learning_rate": 5.44082762256194e-07, "logits/chosen": 0.16796875, "logits/rejected": 0.04240112379193306, "logps/chosen": -395.1000061035156, "logps/rejected": -428.6000061035156, "loss": 0.0891, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.15703125298023224, "rewards/margins": 5.321875095367432, "rewards/rejected": -5.4765625, "step": 6920 }, { "epoch": 1.8265682656826567, "grad_norm": 37.74186745422406, "learning_rate": 5.434238270954138e-07, "logits/chosen": 0.17558594048023224, "logits/rejected": -0.12793579697608948, "logps/chosen": -407.25, "logps/rejected": -422.0, "loss": 0.0782, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3949829041957855, "rewards/margins": 5.293749809265137, "rewards/rejected": -5.685937404632568, "step": 6930 }, { "epoch": 1.8292040063257775, "grad_norm": 10.831250703546516, "learning_rate": 5.427648919346335e-07, "logits/chosen": 0.115447998046875, "logits/rejected": -0.11520691215991974, "logps/chosen": -365.1000061035156, "logps/rejected": -371.1000061035156, "loss": 0.1225, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.833544909954071, "rewards/margins": 4.628125190734863, "rewards/rejected": -5.46484375, "step": 6940 }, { "epoch": 1.8318397469688983, "grad_norm": 10.072209413876259, "learning_rate": 5.421059567738534e-07, "logits/chosen": 0.2562011778354645, "logits/rejected": 0.08948364108800888, "logps/chosen": -398.1499938964844, "logps/rejected": -439.70001220703125, "loss": 0.0913, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.15911865234375, "rewards/margins": 5.510937690734863, "rewards/rejected": -5.667187690734863, "step": 6950 }, { "epoch": 1.834475487612019, "grad_norm": 90.99449267695972, "learning_rate": 5.414470216130733e-07, "logits/chosen": 0.20668944716453552, "logits/rejected": 0.15158692002296448, "logps/chosen": -323.20001220703125, "logps/rejected": -363.95001220703125, "loss": 0.1329, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0015991211403161287, "rewards/margins": 5.271874904632568, "rewards/rejected": -5.270312309265137, "step": 6960 }, { "epoch": 1.8371112282551398, "grad_norm": 48.45573685551274, "learning_rate": 5.407880864522931e-07, "logits/chosen": 0.21143189072608948, "logits/rejected": 0.0082244873046875, "logps/chosen": -416.79998779296875, "logps/rejected": -425.75, "loss": 0.0505, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24912719428539276, "rewards/margins": 5.618750095367432, "rewards/rejected": -5.372656345367432, "step": 6970 }, { "epoch": 1.8397469688982604, "grad_norm": 14.712337535441145, "learning_rate": 5.40129151291513e-07, "logits/chosen": 0.07244873046875, "logits/rejected": -0.05002746731042862, "logps/chosen": -426.20001220703125, "logps/rejected": -450.70001220703125, "loss": 0.084, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3093322813510895, "rewards/margins": 5.685156345367432, "rewards/rejected": -5.378125190734863, "step": 6980 }, { "epoch": 1.8423827095413812, "grad_norm": 49.837108142547216, "learning_rate": 5.394702161307327e-07, "logits/chosen": 0.11162109673023224, "logits/rejected": 0.000244140625, "logps/chosen": -364.5, "logps/rejected": -353.04998779296875, "loss": 0.0931, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2624267637729645, "rewards/margins": 4.961718559265137, "rewards/rejected": -4.69921875, "step": 6990 }, { "epoch": 1.8450184501845017, "grad_norm": 15.419982636332678, "learning_rate": 5.388112809699525e-07, "logits/chosen": 0.3532958924770355, "logits/rejected": 0.0927581787109375, "logps/chosen": -368.8500061035156, "logps/rejected": -389.70001220703125, "loss": 0.1447, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16328124701976776, "rewards/margins": 5.254687309265137, "rewards/rejected": -5.089062690734863, "step": 7000 }, { "epoch": 1.8476541908276225, "grad_norm": 22.863923173310674, "learning_rate": 5.381523458091724e-07, "logits/chosen": 0.24951477348804474, "logits/rejected": -0.02238159254193306, "logps/chosen": -355.8500061035156, "logps/rejected": -391.6000061035156, "loss": 0.0662, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.02971191331744194, "rewards/margins": 5.25390625, "rewards/rejected": -5.225781440734863, "step": 7010 }, { "epoch": 1.8502899314707433, "grad_norm": 15.042450425447203, "learning_rate": 5.374934106483921e-07, "logits/chosen": 0.08229980617761612, "logits/rejected": 0.0034057616721838713, "logps/chosen": -383.6499938964844, "logps/rejected": -406.29998779296875, "loss": 0.1101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.25322264432907104, "rewards/margins": 4.967187404632568, "rewards/rejected": -5.221093654632568, "step": 7020 }, { "epoch": 1.852925672113864, "grad_norm": 44.01991304005375, "learning_rate": 5.36834475487612e-07, "logits/chosen": 0.29359132051467896, "logits/rejected": 0.11154785007238388, "logps/chosen": -353.1000061035156, "logps/rejected": -344.20001220703125, "loss": 0.1352, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.134246826171875, "rewards/margins": 4.739843845367432, "rewards/rejected": -4.875, "step": 7030 }, { "epoch": 1.8555614127569848, "grad_norm": 9.42960079746072, "learning_rate": 5.361755403268317e-07, "logits/chosen": 0.05121765285730362, "logits/rejected": -0.22086334228515625, "logps/chosen": -397.1499938964844, "logps/rejected": -392.20001220703125, "loss": 0.0674, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.05246581882238388, "rewards/margins": 5.704687595367432, "rewards/rejected": -5.658593654632568, "step": 7040 }, { "epoch": 1.8581971534001054, "grad_norm": 52.560877674663395, "learning_rate": 5.355166051660517e-07, "logits/chosen": 0.22047729790210724, "logits/rejected": 0.07453612983226776, "logps/chosen": -396.04998779296875, "logps/rejected": -391.04998779296875, "loss": 0.1313, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.00274658203125, "rewards/margins": 5.239062309265137, "rewards/rejected": -5.241406440734863, "step": 7050 }, { "epoch": 1.8608328940432262, "grad_norm": 22.329378244554043, "learning_rate": 5.348576700052715e-07, "logits/chosen": 0.12152099609375, "logits/rejected": -0.04368286207318306, "logps/chosen": -343.54998779296875, "logps/rejected": -394.3999938964844, "loss": 0.0805, "rewards/accuracies": 0.96875, "rewards/chosen": -0.21542663872241974, "rewards/margins": 5.271874904632568, "rewards/rejected": -5.483593940734863, "step": 7060 }, { "epoch": 1.8634686346863467, "grad_norm": 46.148342438128154, "learning_rate": 5.341987348444913e-07, "logits/chosen": -0.03294067457318306, "logits/rejected": -0.14821776747703552, "logps/chosen": -350.3500061035156, "logps/rejected": -395.3999938964844, "loss": 0.0665, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.127288818359375, "rewards/margins": 5.563281059265137, "rewards/rejected": -5.435156345367432, "step": 7070 }, { "epoch": 1.8661043753294675, "grad_norm": 111.44235232711007, "learning_rate": 5.335397996837111e-07, "logits/chosen": 0.08922119438648224, "logits/rejected": -0.2972412109375, "logps/chosen": -365.95001220703125, "logps/rejected": -394.20001220703125, "loss": 0.1341, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.553759753704071, "rewards/margins": 5.34375, "rewards/rejected": -5.895312309265137, "step": 7080 }, { "epoch": 1.8687401159725883, "grad_norm": 19.885408312798084, "learning_rate": 5.328808645229309e-07, "logits/chosen": 0.0452880859375, "logits/rejected": -0.0417327880859375, "logps/chosen": -371.54998779296875, "logps/rejected": -408.3500061035156, "loss": 0.1436, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1754150390625, "rewards/margins": 5.178124904632568, "rewards/rejected": -5.352343559265137, "step": 7090 }, { "epoch": 1.871375856615709, "grad_norm": 24.377193891862458, "learning_rate": 5.322219293621507e-07, "logits/chosen": -0.01462402381002903, "logits/rejected": -0.13770751655101776, "logps/chosen": -388.0, "logps/rejected": -388.3500061035156, "loss": 0.098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10629882663488388, "rewards/margins": 5.012499809265137, "rewards/rejected": -4.903906345367432, "step": 7100 }, { "epoch": 1.8740115972588298, "grad_norm": 42.81035937487998, "learning_rate": 5.315629942013705e-07, "logits/chosen": 0.11722411960363388, "logits/rejected": -0.04685058444738388, "logps/chosen": -320.75, "logps/rejected": -356.70001220703125, "loss": 0.1315, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21416015923023224, "rewards/margins": 4.83984375, "rewards/rejected": -4.62890625, "step": 7110 }, { "epoch": 1.8766473379019506, "grad_norm": 12.432595786944667, "learning_rate": 5.309040590405904e-07, "logits/chosen": 0.07854004204273224, "logits/rejected": -0.14202269911766052, "logps/chosen": -376.25, "logps/rejected": -401.6000061035156, "loss": 0.094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.09965209662914276, "rewards/margins": 5.221093654632568, "rewards/rejected": -5.317187309265137, "step": 7120 }, { "epoch": 1.8792830785450712, "grad_norm": 32.51099050580076, "learning_rate": 5.302451238798103e-07, "logits/chosen": -0.09785155951976776, "logits/rejected": -0.10318603366613388, "logps/chosen": -354.0, "logps/rejected": -377.1000061035156, "loss": 0.1138, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.1712646484375, "rewards/margins": 5.036718845367432, "rewards/rejected": -5.205468654632568, "step": 7130 }, { "epoch": 1.881918819188192, "grad_norm": 39.07155811756426, "learning_rate": 5.2958618871903e-07, "logits/chosen": -0.07442016899585724, "logits/rejected": -0.1380874663591385, "logps/chosen": -344.1000061035156, "logps/rejected": -353.1499938964844, "loss": 0.1145, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.453125, "rewards/margins": 5.045312404632568, "rewards/rejected": -4.592968940734863, "step": 7140 }, { "epoch": 1.8845545598313125, "grad_norm": 12.643547147715092, "learning_rate": 5.289272535582499e-07, "logits/chosen": 0.006103515625, "logits/rejected": -0.07357177883386612, "logps/chosen": -378.70001220703125, "logps/rejected": -400.04998779296875, "loss": 0.1148, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3124755918979645, "rewards/margins": 5.366406440734863, "rewards/rejected": -5.057031154632568, "step": 7150 }, { "epoch": 1.8871903004744333, "grad_norm": 30.772713600312862, "learning_rate": 5.282683183974696e-07, "logits/chosen": 0.23129883408546448, "logits/rejected": -0.01693115197122097, "logps/chosen": -383.6499938964844, "logps/rejected": -349.54998779296875, "loss": 0.1071, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3979248106479645, "rewards/margins": 4.76171875, "rewards/rejected": -4.366406440734863, "step": 7160 }, { "epoch": 1.889826041117554, "grad_norm": 52.28710441161794, "learning_rate": 5.276093832366895e-07, "logits/chosen": 0.16884613037109375, "logits/rejected": -0.02609863318502903, "logps/chosen": -375.8500061035156, "logps/rejected": -417.54998779296875, "loss": 0.1043, "rewards/accuracies": 0.96875, "rewards/chosen": 0.45631104707717896, "rewards/margins": 5.609375, "rewards/rejected": -5.155468940734863, "step": 7170 }, { "epoch": 1.8924617817606748, "grad_norm": 53.20190875453721, "learning_rate": 5.269504480759093e-07, "logits/chosen": 0.16245117783546448, "logits/rejected": 0.04493103176355362, "logps/chosen": -381.875, "logps/rejected": -386.20001220703125, "loss": 0.0955, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.38111573457717896, "rewards/margins": 5.259375095367432, "rewards/rejected": -4.877343654632568, "step": 7180 }, { "epoch": 1.8950975224037956, "grad_norm": 65.86063292961212, "learning_rate": 5.262915129151291e-07, "logits/chosen": -0.04451904445886612, "logits/rejected": -0.134033203125, "logps/chosen": -357.29998779296875, "logps/rejected": -386.6499938964844, "loss": 0.1183, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0777587890625, "rewards/margins": 4.918749809265137, "rewards/rejected": -4.99609375, "step": 7190 }, { "epoch": 1.8977332630469161, "grad_norm": 27.254510158268296, "learning_rate": 5.25632577754349e-07, "logits/chosen": 0.077301025390625, "logits/rejected": -0.1099853515625, "logps/chosen": -349.6000061035156, "logps/rejected": -404.0, "loss": 0.0883, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.16030272841453552, "rewards/margins": 5.709374904632568, "rewards/rejected": -5.553906440734863, "step": 7200 }, { "epoch": 1.900369003690037, "grad_norm": 103.59607124333075, "learning_rate": 5.249736425935688e-07, "logits/chosen": 0.24826964735984802, "logits/rejected": -0.13939818739891052, "logps/chosen": -386.0, "logps/rejected": -379.0, "loss": 0.0935, "rewards/accuracies": 0.96875, "rewards/chosen": 0.30575257539749146, "rewards/margins": 5.169531345367432, "rewards/rejected": -4.866406440734863, "step": 7210 }, { "epoch": 1.9030047443331575, "grad_norm": 39.741741810770876, "learning_rate": 5.243147074327886e-07, "logits/chosen": 0.15179443359375, "logits/rejected": -0.05086364597082138, "logps/chosen": -399.20001220703125, "logps/rejected": -386.54998779296875, "loss": 0.0967, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18739013373851776, "rewards/margins": 5.112500190734863, "rewards/rejected": -4.922656059265137, "step": 7220 }, { "epoch": 1.9056404849762782, "grad_norm": 31.048688557539467, "learning_rate": 5.236557722720084e-07, "logits/chosen": 0.0384521484375, "logits/rejected": -0.19309082627296448, "logps/chosen": -389.8500061035156, "logps/rejected": -370.20001220703125, "loss": 0.0796, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.835498034954071, "rewards/margins": 5.28515625, "rewards/rejected": -4.449999809265137, "step": 7230 }, { "epoch": 1.908276225619399, "grad_norm": 64.42661603117759, "learning_rate": 5.229968371112282e-07, "logits/chosen": 0.13054199516773224, "logits/rejected": 0.0066009522415697575, "logps/chosen": -371.6499938964844, "logps/rejected": -378.54998779296875, "loss": 0.119, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.647290050983429, "rewards/margins": 5.038281440734863, "rewards/rejected": -4.39453125, "step": 7240 }, { "epoch": 1.9109119662625198, "grad_norm": 17.787457917855555, "learning_rate": 5.223379019504481e-07, "logits/chosen": 0.2780395448207855, "logits/rejected": 0.13046875596046448, "logps/chosen": -355.1499938964844, "logps/rejected": -411.70001220703125, "loss": 0.0863, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.42277830839157104, "rewards/margins": 4.9765625, "rewards/rejected": -4.553124904632568, "step": 7250 }, { "epoch": 1.9135477069056406, "grad_norm": 54.95834687364892, "learning_rate": 5.216789667896678e-07, "logits/chosen": 0.14943543076515198, "logits/rejected": -0.036466218531131744, "logps/chosen": -351.75, "logps/rejected": -379.54998779296875, "loss": 0.0959, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6358642578125, "rewards/margins": 5.390625, "rewards/rejected": -4.756249904632568, "step": 7260 }, { "epoch": 1.9161834475487614, "grad_norm": 77.13738845569536, "learning_rate": 5.210200316288878e-07, "logits/chosen": -0.09049072116613388, "logits/rejected": -0.15340271592140198, "logps/chosen": -361.04998779296875, "logps/rejected": -378.8999938964844, "loss": 0.1152, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.33822327852249146, "rewards/margins": 5.102343559265137, "rewards/rejected": -4.767187595367432, "step": 7270 }, { "epoch": 1.918819188191882, "grad_norm": 12.285946515766978, "learning_rate": 5.203610964681075e-07, "logits/chosen": 0.03779296949505806, "logits/rejected": 0.08008422702550888, "logps/chosen": -381.45001220703125, "logps/rejected": -431.20001220703125, "loss": 0.0888, "rewards/accuracies": 0.96875, "rewards/chosen": 0.281982421875, "rewards/margins": 5.642187595367432, "rewards/rejected": -5.357812404632568, "step": 7280 }, { "epoch": 1.9214549288350027, "grad_norm": 37.620923136426434, "learning_rate": 5.197021613073274e-07, "logits/chosen": 0.07078857719898224, "logits/rejected": -0.15585632622241974, "logps/chosen": -357.8500061035156, "logps/rejected": -383.3500061035156, "loss": 0.1437, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.004467773251235485, "rewards/margins": 5.131249904632568, "rewards/rejected": -5.1328125, "step": 7290 }, { "epoch": 1.9240906694781232, "grad_norm": 14.148956773888223, "learning_rate": 5.190432261465472e-07, "logits/chosen": 0.01101074181497097, "logits/rejected": -0.01772460900247097, "logps/chosen": -386.1000061035156, "logps/rejected": -417.8999938964844, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25751954317092896, "rewards/margins": 5.428124904632568, "rewards/rejected": -5.167187690734863, "step": 7300 }, { "epoch": 1.926726410121244, "grad_norm": 47.52018595872532, "learning_rate": 5.18384290985767e-07, "logits/chosen": 0.07183227688074112, "logits/rejected": -0.11600341647863388, "logps/chosen": -445.6499938964844, "logps/rejected": -413.29998779296875, "loss": 0.0684, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.19666138291358948, "rewards/margins": 5.416406154632568, "rewards/rejected": -5.217968940734863, "step": 7310 }, { "epoch": 1.9293621507643648, "grad_norm": 34.12743036312071, "learning_rate": 5.177253558249868e-07, "logits/chosen": -0.03858032077550888, "logits/rejected": -0.08576049655675888, "logps/chosen": -356.79998779296875, "logps/rejected": -401.70001220703125, "loss": 0.0861, "rewards/accuracies": 0.96875, "rewards/chosen": -0.32069700956344604, "rewards/margins": 5.379687309265137, "rewards/rejected": -5.696875095367432, "step": 7320 }, { "epoch": 1.9319978914074856, "grad_norm": 61.8534449912025, "learning_rate": 5.170664206642065e-07, "logits/chosen": 0.05475158616900444, "logits/rejected": -0.3258422911167145, "logps/chosen": -351.6499938964844, "logps/rejected": -366.8500061035156, "loss": 0.124, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.6774963140487671, "rewards/margins": 4.889062404632568, "rewards/rejected": -5.563281059265137, "step": 7330 }, { "epoch": 1.9346336320506063, "grad_norm": 17.493997802232915, "learning_rate": 5.164074855034264e-07, "logits/chosen": 0.15594482421875, "logits/rejected": -0.09357909858226776, "logps/chosen": -360.54998779296875, "logps/rejected": -370.70001220703125, "loss": 0.1068, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3066162168979645, "rewards/margins": 4.858593940734863, "rewards/rejected": -5.164843559265137, "step": 7340 }, { "epoch": 1.937269372693727, "grad_norm": 119.8194488605298, "learning_rate": 5.157485503426464e-07, "logits/chosen": 0.05842285230755806, "logits/rejected": -0.13516846299171448, "logps/chosen": -397.45001220703125, "logps/rejected": -396.29998779296875, "loss": 0.1186, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2866760194301605, "rewards/margins": 5.046875, "rewards/rejected": -5.329687595367432, "step": 7350 }, { "epoch": 1.9399051133368477, "grad_norm": 43.93044206169433, "learning_rate": 5.150896151818661e-07, "logits/chosen": -0.03166503831744194, "logits/rejected": -0.20626220107078552, "logps/chosen": -341.95001220703125, "logps/rejected": -357.3500061035156, "loss": 0.0767, "rewards/accuracies": 0.96875, "rewards/chosen": -0.06351318210363388, "rewards/margins": 5.42578125, "rewards/rejected": -5.486718654632568, "step": 7360 }, { "epoch": 1.9425408539799682, "grad_norm": 19.606244351318427, "learning_rate": 5.144306800210859e-07, "logits/chosen": 0.09937896579504013, "logits/rejected": -0.15078124403953552, "logps/chosen": -397.25, "logps/rejected": -395.3500061035156, "loss": 0.0767, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.07947997748851776, "rewards/margins": 5.118750095367432, "rewards/rejected": -5.0390625, "step": 7370 }, { "epoch": 1.945176594623089, "grad_norm": 50.64725942550061, "learning_rate": 5.137717448603057e-07, "logits/chosen": 0.17451782524585724, "logits/rejected": -0.0352783203125, "logps/chosen": -350.8500061035156, "logps/rejected": -343.8999938964844, "loss": 0.1158, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1470489501953125, "rewards/margins": 4.999218940734863, "rewards/rejected": -4.85546875, "step": 7380 }, { "epoch": 1.9478123352662098, "grad_norm": 75.91461825749217, "learning_rate": 5.131128096995255e-07, "logits/chosen": 0.135894775390625, "logits/rejected": 0.0082244873046875, "logps/chosen": -373.54998779296875, "logps/rejected": -430.6000061035156, "loss": 0.087, "rewards/accuracies": 0.96875, "rewards/chosen": -0.19808349013328552, "rewards/margins": 4.883593559265137, "rewards/rejected": -5.079687595367432, "step": 7390 }, { "epoch": 1.9504480759093306, "grad_norm": 31.286460041644723, "learning_rate": 5.124538745387453e-07, "logits/chosen": 0.04711303859949112, "logits/rejected": -0.02321777306497097, "logps/chosen": -377.45001220703125, "logps/rejected": -388.04998779296875, "loss": 0.0884, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0562744140625, "rewards/margins": 5.224999904632568, "rewards/rejected": -5.282812595367432, "step": 7400 }, { "epoch": 1.9530838165524513, "grad_norm": 34.94813703033813, "learning_rate": 5.117949393779651e-07, "logits/chosen": -0.09428100287914276, "logits/rejected": -0.15581054985523224, "logps/chosen": -370.29998779296875, "logps/rejected": -429.3999938964844, "loss": 0.0816, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.140869140625, "rewards/margins": 5.459374904632568, "rewards/rejected": -5.603125095367432, "step": 7410 }, { "epoch": 1.9557195571955721, "grad_norm": 20.921862489649982, "learning_rate": 5.111360042171851e-07, "logits/chosen": -0.07819823920726776, "logits/rejected": -0.23270264267921448, "logps/chosen": -409.75, "logps/rejected": -424.3999938964844, "loss": 0.1166, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4489502012729645, "rewards/margins": 5.379687309265137, "rewards/rejected": -5.825781345367432, "step": 7420 }, { "epoch": 1.9583552978386927, "grad_norm": 15.420149973533343, "learning_rate": 5.104770690564048e-07, "logits/chosen": -0.15227051079273224, "logits/rejected": -0.20051269233226776, "logps/chosen": -345.20001220703125, "logps/rejected": -435.20001220703125, "loss": 0.0824, "rewards/accuracies": 0.96875, "rewards/chosen": -0.04433593899011612, "rewards/margins": 5.940625190734863, "rewards/rejected": -5.985937595367432, "step": 7430 }, { "epoch": 1.9609910384818134, "grad_norm": 36.460303766146225, "learning_rate": 5.098181338956247e-07, "logits/chosen": 0.0971832275390625, "logits/rejected": -0.17537841200828552, "logps/chosen": -401.1499938964844, "logps/rejected": -423.70001220703125, "loss": 0.0892, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3322387635707855, "rewards/margins": 5.98828125, "rewards/rejected": -5.651562690734863, "step": 7440 }, { "epoch": 1.963626779124934, "grad_norm": 37.629305614991075, "learning_rate": 5.091591987348444e-07, "logits/chosen": 0.15980835258960724, "logits/rejected": 0.10205078125, "logps/chosen": -357.04998779296875, "logps/rejected": -368.3999938964844, "loss": 0.0745, "rewards/accuracies": 0.96875, "rewards/chosen": -0.12355957180261612, "rewards/margins": 5.11328125, "rewards/rejected": -5.235937595367432, "step": 7450 }, { "epoch": 1.9662625197680548, "grad_norm": 46.94970955181034, "learning_rate": 5.085002635740643e-07, "logits/chosen": 0.099029541015625, "logits/rejected": 0.0018798827659338713, "logps/chosen": -324.8999938964844, "logps/rejected": -350.04998779296875, "loss": 0.0773, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.33984375, "rewards/margins": 5.23046875, "rewards/rejected": -4.890625, "step": 7460 }, { "epoch": 1.9688982604111755, "grad_norm": 54.750037693626666, "learning_rate": 5.078413284132841e-07, "logits/chosen": 0.04892577975988388, "logits/rejected": -0.09995117038488388, "logps/chosen": -370.3500061035156, "logps/rejected": -400.45001220703125, "loss": 0.1287, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.2971435487270355, "rewards/margins": 4.935156345367432, "rewards/rejected": -4.640625, "step": 7470 }, { "epoch": 1.9715340010542963, "grad_norm": 36.59872980381171, "learning_rate": 5.071823932525039e-07, "logits/chosen": 0.09415283054113388, "logits/rejected": -0.04686889797449112, "logps/chosen": -392.54998779296875, "logps/rejected": -382.25, "loss": 0.0893, "rewards/accuracies": 0.96875, "rewards/chosen": 0.672412097454071, "rewards/margins": 5.19140625, "rewards/rejected": -4.521093845367432, "step": 7480 }, { "epoch": 1.974169741697417, "grad_norm": 31.62579828952649, "learning_rate": 5.065234580917237e-07, "logits/chosen": 0.09796142578125, "logits/rejected": -0.0033203125931322575, "logps/chosen": -342.0, "logps/rejected": -356.79998779296875, "loss": 0.1183, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5767577886581421, "rewards/margins": 5.275781154632568, "rewards/rejected": -4.700781345367432, "step": 7490 }, { "epoch": 1.9768054823405377, "grad_norm": 39.58793262441074, "learning_rate": 5.058645229309436e-07, "logits/chosen": 0.10877685248851776, "logits/rejected": -0.14938965439796448, "logps/chosen": -373.95001220703125, "logps/rejected": -365.6499938964844, "loss": 0.0984, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.593945324420929, "rewards/margins": 5.131249904632568, "rewards/rejected": -4.536718845367432, "step": 7500 }, { "epoch": 1.9794412229836584, "grad_norm": 29.658008489459455, "learning_rate": 5.052055877701634e-07, "logits/chosen": 0.197540283203125, "logits/rejected": -0.0004150390741415322, "logps/chosen": -287.6499938964844, "logps/rejected": -337.1499938964844, "loss": 0.0972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24345703423023224, "rewards/margins": 4.942187309265137, "rewards/rejected": -4.694531440734863, "step": 7510 }, { "epoch": 1.982076963626779, "grad_norm": 41.66734304393667, "learning_rate": 5.045466526093833e-07, "logits/chosen": 0.03653564304113388, "logits/rejected": -0.0061706542037427425, "logps/chosen": -381.1499938964844, "logps/rejected": -401.54998779296875, "loss": 0.0677, "rewards/accuracies": 0.96875, "rewards/chosen": 0.44038087129592896, "rewards/margins": 5.802343845367432, "rewards/rejected": -5.370312690734863, "step": 7520 }, { "epoch": 1.9847127042698998, "grad_norm": 57.191052757445924, "learning_rate": 5.03887717448603e-07, "logits/chosen": -0.049468994140625, "logits/rejected": -0.13028565049171448, "logps/chosen": -344.6499938964844, "logps/rejected": -384.25, "loss": 0.1051, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.12685546278953552, "rewards/margins": 5.102343559265137, "rewards/rejected": -5.228125095367432, "step": 7530 }, { "epoch": 1.9873484449130205, "grad_norm": 25.73762406762995, "learning_rate": 5.032287822878229e-07, "logits/chosen": 0.04049377515912056, "logits/rejected": -0.07585449516773224, "logps/chosen": -357.6499938964844, "logps/rejected": -413.5, "loss": 0.0907, "rewards/accuracies": 0.96875, "rewards/chosen": -0.01529541052877903, "rewards/margins": 5.435156345367432, "rewards/rejected": -5.453906059265137, "step": 7540 }, { "epoch": 1.9899841855561413, "grad_norm": 52.19019119840338, "learning_rate": 5.025698471270426e-07, "logits/chosen": 0.03395996242761612, "logits/rejected": -0.20601196587085724, "logps/chosen": -399.8999938964844, "logps/rejected": -427.5, "loss": 0.0697, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.07182617485523224, "rewards/margins": 5.86328125, "rewards/rejected": -5.7890625, "step": 7550 }, { "epoch": 1.992619926199262, "grad_norm": 27.568146085879615, "learning_rate": 5.019109119662625e-07, "logits/chosen": -0.05274658277630806, "logits/rejected": -0.0321044921875, "logps/chosen": -358.8500061035156, "logps/rejected": -439.1000061035156, "loss": 0.104, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.03510131686925888, "rewards/margins": 5.478906154632568, "rewards/rejected": -5.440625190734863, "step": 7560 }, { "epoch": 1.9952556668423829, "grad_norm": 45.01994059928831, "learning_rate": 5.012519768054824e-07, "logits/chosen": 0.05794677883386612, "logits/rejected": -0.18687744438648224, "logps/chosen": -398.6499938964844, "logps/rejected": -388.79998779296875, "loss": 0.0849, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3701415956020355, "rewards/margins": 5.246874809265137, "rewards/rejected": -4.878125190734863, "step": 7570 }, { "epoch": 1.9978914074855034, "grad_norm": 11.864455625648503, "learning_rate": 5.005930416447022e-07, "logits/chosen": 0.07451172173023224, "logits/rejected": -0.02122192457318306, "logps/chosen": -365.8999938964844, "logps/rejected": -399.45001220703125, "loss": 0.1056, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.12521973252296448, "rewards/margins": 5.332812309265137, "rewards/rejected": -5.208593845367432, "step": 7580 }, { "epoch": 2.000527148128624, "grad_norm": 7.024520250291432, "learning_rate": 4.99934106483922e-07, "logits/chosen": 0.06009521335363388, "logits/rejected": -0.19587402045726776, "logps/chosen": -365.1499938964844, "logps/rejected": -395.54998779296875, "loss": 0.1043, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.26368409395217896, "rewards/margins": 5.842968940734863, "rewards/rejected": -5.580468654632568, "step": 7590 }, { "epoch": 2.0031628887717448, "grad_norm": 8.866195082555484, "learning_rate": 4.992751713231418e-07, "logits/chosen": -0.04547119140625, "logits/rejected": -0.26494139432907104, "logps/chosen": -366.04998779296875, "logps/rejected": -413.29998779296875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 0.2513427734375, "rewards/margins": 6.603125095367432, "rewards/rejected": -6.3515625, "step": 7600 }, { "epoch": 2.0057986294148655, "grad_norm": 4.3834153071073585, "learning_rate": 4.986162361623616e-07, "logits/chosen": -0.07100830227136612, "logits/rejected": -0.2622436583042145, "logps/chosen": -367.54998779296875, "logps/rejected": -403.5, "loss": 0.029, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.35185545682907104, "rewards/margins": 6.553124904632568, "rewards/rejected": -6.90625, "step": 7610 }, { "epoch": 2.0084343700579863, "grad_norm": 42.05252911680176, "learning_rate": 4.979573010015814e-07, "logits/chosen": -0.2911621034145355, "logits/rejected": -0.3461547791957855, "logps/chosen": -406.3999938964844, "logps/rejected": -437.1000061035156, "loss": 0.0147, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.23227539658546448, "rewards/margins": 7.010937690734863, "rewards/rejected": -7.245312690734863, "step": 7620 }, { "epoch": 2.011070110701107, "grad_norm": 9.65476787709062, "learning_rate": 4.972983658408013e-07, "logits/chosen": -0.15788574516773224, "logits/rejected": -0.45555418729782104, "logps/chosen": -412.6499938964844, "logps/rejected": -446.29998779296875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.15603026747703552, "rewards/margins": 7.571875095367432, "rewards/rejected": -7.737500190734863, "step": 7630 }, { "epoch": 2.013705851344228, "grad_norm": 7.7201137617126, "learning_rate": 4.966394306800211e-07, "logits/chosen": -0.12974853813648224, "logits/rejected": -0.513427734375, "logps/chosen": -408.6499938964844, "logps/rejected": -445.45001220703125, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.7552490234375, "rewards/margins": 7.982812404632568, "rewards/rejected": -8.739062309265137, "step": 7640 }, { "epoch": 2.0163415919873486, "grad_norm": 4.35295882070041, "learning_rate": 4.959804955192409e-07, "logits/chosen": -0.25953370332717896, "logits/rejected": -0.45561522245407104, "logps/chosen": -369.8999938964844, "logps/rejected": -407.25, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4035155773162842, "rewards/margins": 6.954687595367432, "rewards/rejected": -8.357812881469727, "step": 7650 }, { "epoch": 2.018977332630469, "grad_norm": 14.977283063585324, "learning_rate": 4.953215603584608e-07, "logits/chosen": -0.292105108499527, "logits/rejected": -0.560791015625, "logps/chosen": -382.45001220703125, "logps/rejected": -441.8999938964844, "loss": 0.0235, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.971630871295929, "rewards/margins": 7.604687690734863, "rewards/rejected": -8.581250190734863, "step": 7660 }, { "epoch": 2.0216130732735897, "grad_norm": 4.532020901269925, "learning_rate": 4.946626251976805e-07, "logits/chosen": -0.17230224609375, "logits/rejected": -0.41367799043655396, "logps/chosen": -372.25, "logps/rejected": -421.20001220703125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -0.590039074420929, "rewards/margins": 7.428124904632568, "rewards/rejected": -8.018750190734863, "step": 7670 }, { "epoch": 2.0242488139167105, "grad_norm": 2.1069841996523975, "learning_rate": 4.940036900369003e-07, "logits/chosen": -0.17882385849952698, "logits/rejected": -0.347930908203125, "logps/chosen": -355.3999938964844, "logps/rejected": -421.70001220703125, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -0.5118652582168579, "rewards/margins": 7.723437309265137, "rewards/rejected": -8.2421875, "step": 7680 }, { "epoch": 2.0268845545598313, "grad_norm": 6.263376585021282, "learning_rate": 4.933447548761202e-07, "logits/chosen": -0.22382812201976776, "logits/rejected": -0.62066650390625, "logps/chosen": -339.8999938964844, "logps/rejected": -427.3999938964844, "loss": 0.012, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9144531488418579, "rewards/margins": 7.453125, "rewards/rejected": -8.364062309265137, "step": 7690 }, { "epoch": 2.029520295202952, "grad_norm": 7.596103057828784, "learning_rate": 4.9268581971534e-07, "logits/chosen": -0.22834472358226776, "logits/rejected": -0.4796508848667145, "logps/chosen": -343.5, "logps/rejected": -380.0, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -0.938098132610321, "rewards/margins": 7.314062595367432, "rewards/rejected": -8.243749618530273, "step": 7700 }, { "epoch": 2.032156035846073, "grad_norm": 2.337252491741043, "learning_rate": 4.920268845545598e-07, "logits/chosen": -0.17890624701976776, "logits/rejected": -0.28132325410842896, "logps/chosen": -330.75, "logps/rejected": -415.1000061035156, "loss": 0.0314, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.76959228515625, "rewards/margins": 7.032812595367432, "rewards/rejected": -7.8046875, "step": 7710 }, { "epoch": 2.0347917764891936, "grad_norm": 3.5990686638308538, "learning_rate": 4.913679493937796e-07, "logits/chosen": -0.21873779594898224, "logits/rejected": -0.48545533418655396, "logps/chosen": -379.0, "logps/rejected": -405.6499938964844, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8357909917831421, "rewards/margins": 7.2578125, "rewards/rejected": -8.0859375, "step": 7720 }, { "epoch": 2.037427517132314, "grad_norm": 2.966518735518197, "learning_rate": 4.907090142329994e-07, "logits/chosen": -0.2493896484375, "logits/rejected": -0.49345701932907104, "logps/chosen": -350.8500061035156, "logps/rejected": -407.5, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -0.599291980266571, "rewards/margins": 7.489062309265137, "rewards/rejected": -8.081250190734863, "step": 7730 }, { "epoch": 2.0400632577754347, "grad_norm": 6.851656795734662, "learning_rate": 4.900500790722192e-07, "logits/chosen": -0.28776854276657104, "logits/rejected": -0.2506957948207855, "logps/chosen": -361.0, "logps/rejected": -414.29998779296875, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": -1.0330810546875, "rewards/margins": 6.839062690734863, "rewards/rejected": -7.868750095367432, "step": 7740 }, { "epoch": 2.0426989984185555, "grad_norm": 19.343749073705638, "learning_rate": 4.893911439114391e-07, "logits/chosen": -0.3558105528354645, "logits/rejected": -0.47493284940719604, "logps/chosen": -398.95001220703125, "logps/rejected": -450.1000061035156, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.47895509004592896, "rewards/margins": 7.839062690734863, "rewards/rejected": -8.317187309265137, "step": 7750 }, { "epoch": 2.0453347390616763, "grad_norm": 1.4919885211145858, "learning_rate": 4.887322087506589e-07, "logits/chosen": -0.05199585109949112, "logits/rejected": -0.41257935762405396, "logps/chosen": -398.1000061035156, "logps/rejected": -411.70001220703125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -0.7419677972793579, "rewards/margins": 7.862500190734863, "rewards/rejected": -8.610937118530273, "step": 7760 }, { "epoch": 2.047970479704797, "grad_norm": 1.9922765760723171, "learning_rate": 4.880732735898787e-07, "logits/chosen": -0.25017088651657104, "logits/rejected": -0.49321287870407104, "logps/chosen": -381.04998779296875, "logps/rejected": -424.1499938964844, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -0.964599609375, "rewards/margins": 7.889062404632568, "rewards/rejected": -8.848437309265137, "step": 7770 }, { "epoch": 2.050606220347918, "grad_norm": 14.648333546939531, "learning_rate": 4.874143384290985e-07, "logits/chosen": -0.21605834364891052, "logits/rejected": -0.4852050840854645, "logps/chosen": -373.1499938964844, "logps/rejected": -440.0, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -0.705126941204071, "rewards/margins": 7.446875095367432, "rewards/rejected": -8.1484375, "step": 7780 }, { "epoch": 2.0532419609910386, "grad_norm": 2.4379545289810944, "learning_rate": 4.867554032683184e-07, "logits/chosen": -0.2358711212873459, "logits/rejected": -0.5773681402206421, "logps/chosen": -363.1499938964844, "logps/rejected": -408.29998779296875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.730151355266571, "rewards/margins": 8.203125, "rewards/rejected": -8.931249618530273, "step": 7790 }, { "epoch": 2.0558777016341594, "grad_norm": 1.0043835434827662, "learning_rate": 4.860964681075382e-07, "logits/chosen": -0.27912598848342896, "logits/rejected": -0.5451599359512329, "logps/chosen": -372.29998779296875, "logps/rejected": -433.29998779296875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.539501965045929, "rewards/margins": 7.973437309265137, "rewards/rejected": -8.506250381469727, "step": 7800 }, { "epoch": 2.0585134422772797, "grad_norm": 0.9790230732938776, "learning_rate": 4.854375329467581e-07, "logits/chosen": -0.35633546113967896, "logits/rejected": -0.664111316204071, "logps/chosen": -391.20001220703125, "logps/rejected": -387.04998779296875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.4071044921875, "rewards/margins": 7.654687404632568, "rewards/rejected": -8.0546875, "step": 7810 }, { "epoch": 2.0611491829204005, "grad_norm": 53.8588199186574, "learning_rate": 4.847785977859778e-07, "logits/chosen": -0.2865234315395355, "logits/rejected": -0.4794677793979645, "logps/chosen": -348.79998779296875, "logps/rejected": -440.1000061035156, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2255859375, "rewards/margins": 7.956250190734863, "rewards/rejected": -9.182812690734863, "step": 7820 }, { "epoch": 2.0637849235635213, "grad_norm": 4.218976891009878, "learning_rate": 4.841196626251977e-07, "logits/chosen": -0.38873291015625, "logits/rejected": -0.607617199420929, "logps/chosen": -401.54998779296875, "logps/rejected": -450.5, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -0.9362548589706421, "rewards/margins": 7.834374904632568, "rewards/rejected": -8.759374618530273, "step": 7830 }, { "epoch": 2.066420664206642, "grad_norm": 11.813473363144844, "learning_rate": 4.834607274644174e-07, "logits/chosen": -0.3989807069301605, "logits/rejected": -0.562207043170929, "logps/chosen": -353.04998779296875, "logps/rejected": -383.0, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7509521245956421, "rewards/margins": 7.703125, "rewards/rejected": -8.451562881469727, "step": 7840 }, { "epoch": 2.069056404849763, "grad_norm": 7.715112835681503, "learning_rate": 4.828017923036373e-07, "logits/chosen": -0.33624267578125, "logits/rejected": -0.4302734434604645, "logps/chosen": -381.20001220703125, "logps/rejected": -467.6000061035156, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.78076171875, "rewards/margins": 7.901562690734863, "rewards/rejected": -8.676562309265137, "step": 7850 }, { "epoch": 2.0716921454928836, "grad_norm": 1.3788176857366963, "learning_rate": 4.821428571428571e-07, "logits/chosen": -0.08834533393383026, "logits/rejected": -0.3973754942417145, "logps/chosen": -392.1000061035156, "logps/rejected": -434.0, "loss": 0.0184, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.19977417588233948, "rewards/margins": 7.667187690734863, "rewards/rejected": -7.46875, "step": 7860 }, { "epoch": 2.0743278861360044, "grad_norm": 9.765288018002954, "learning_rate": 4.81483921982077e-07, "logits/chosen": -0.19979247450828552, "logits/rejected": -0.37946778535842896, "logps/chosen": -367.79998779296875, "logps/rejected": -424.0, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -0.24205322563648224, "rewards/margins": 7.848437309265137, "rewards/rejected": -8.089062690734863, "step": 7870 }, { "epoch": 2.0769636267791247, "grad_norm": 8.118243583281853, "learning_rate": 4.808249868212967e-07, "logits/chosen": -0.17474365234375, "logits/rejected": -0.5517578125, "logps/chosen": -362.6499938964844, "logps/rejected": -377.0, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.025488257408142, "rewards/margins": 7.392187595367432, "rewards/rejected": -8.415624618530273, "step": 7880 }, { "epoch": 2.0795993674222455, "grad_norm": 10.495081092185284, "learning_rate": 4.801660516605166e-07, "logits/chosen": -0.24091187119483948, "logits/rejected": -0.4225219786167145, "logps/chosen": -422.0, "logps/rejected": -434.6000061035156, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -0.7437804937362671, "rewards/margins": 7.287499904632568, "rewards/rejected": -8.037500381469727, "step": 7890 }, { "epoch": 2.0822351080653663, "grad_norm": 1.593465663182507, "learning_rate": 4.795071164997364e-07, "logits/chosen": -0.33369141817092896, "logits/rejected": -0.5452880859375, "logps/chosen": -407.5, "logps/rejected": -443.8500061035156, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.624157726764679, "rewards/margins": 8.279687881469727, "rewards/rejected": -8.901562690734863, "step": 7900 }, { "epoch": 2.084870848708487, "grad_norm": 0.9770260260768143, "learning_rate": 4.788481813389562e-07, "logits/chosen": -0.32447510957717896, "logits/rejected": -0.571826159954071, "logps/chosen": -343.7250061035156, "logps/rejected": -408.70001220703125, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.279296875, "rewards/margins": 7.801562309265137, "rewards/rejected": -9.073437690734863, "step": 7910 }, { "epoch": 2.087506589351608, "grad_norm": 4.186001556463956, "learning_rate": 4.78189246178176e-07, "logits/chosen": -0.07771758735179901, "logits/rejected": -0.44575804471969604, "logps/chosen": -363.3999938964844, "logps/rejected": -421.3999938964844, "loss": 0.0282, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1514403820037842, "rewards/margins": 7.684374809265137, "rewards/rejected": -8.832812309265137, "step": 7920 }, { "epoch": 2.0901423299947286, "grad_norm": 3.4396958467938434, "learning_rate": 4.775303110173959e-07, "logits/chosen": -0.23808594048023224, "logits/rejected": -0.567187488079071, "logps/chosen": -395.79998779296875, "logps/rejected": -441.8999938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.133569359779358, "rewards/margins": 8.034375190734863, "rewards/rejected": -9.1640625, "step": 7930 }, { "epoch": 2.0927780706378494, "grad_norm": 0.8595731253813993, "learning_rate": 4.768713758566157e-07, "logits/chosen": -0.15927734971046448, "logits/rejected": -0.598461925983429, "logps/chosen": -429.0, "logps/rejected": -451.6000061035156, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.0591919422149658, "rewards/margins": 8.456250190734863, "rewards/rejected": -9.515625, "step": 7940 }, { "epoch": 2.09541381128097, "grad_norm": 0.17071348846196147, "learning_rate": 4.762124406958355e-07, "logits/chosen": -0.22700195014476776, "logits/rejected": -0.4549560546875, "logps/chosen": -398.54998779296875, "logps/rejected": -456.3999938964844, "loss": 0.0444, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.59619140625, "rewards/margins": 8.532812118530273, "rewards/rejected": -10.120312690734863, "step": 7950 }, { "epoch": 2.0980495519240905, "grad_norm": 42.02950929243473, "learning_rate": 4.7555350553505536e-07, "logits/chosen": -0.3731750547885895, "logits/rejected": -0.6761230230331421, "logps/chosen": -427.6000061035156, "logps/rejected": -466.1000061035156, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.905175805091858, "rewards/margins": 8.229687690734863, "rewards/rejected": -10.129687309265137, "step": 7960 }, { "epoch": 2.1006852925672113, "grad_norm": 0.7550455500126316, "learning_rate": 4.7489457037427516e-07, "logits/chosen": -0.17879638075828552, "logits/rejected": -0.554003894329071, "logps/chosen": -452.20001220703125, "logps/rejected": -467.79998779296875, "loss": 0.0126, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.305639624595642, "rewards/margins": 8.432812690734863, "rewards/rejected": -9.735937118530273, "step": 7970 }, { "epoch": 2.103321033210332, "grad_norm": 1.0820079245185958, "learning_rate": 4.7423563521349496e-07, "logits/chosen": -0.4032348692417145, "logits/rejected": -0.520263671875, "logps/chosen": -396.3999938964844, "logps/rejected": -410.70001220703125, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.326928734779358, "rewards/margins": 8.087499618530273, "rewards/rejected": -9.4140625, "step": 7980 }, { "epoch": 2.105956773853453, "grad_norm": 2.797779142015728, "learning_rate": 4.7357670005271475e-07, "logits/chosen": -0.18572998046875, "logits/rejected": -0.45991820096969604, "logps/chosen": -383.1499938964844, "logps/rejected": -441.8999938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.8765624761581421, "rewards/margins": 7.782812595367432, "rewards/rejected": -8.662500381469727, "step": 7990 }, { "epoch": 2.1085925144965736, "grad_norm": 2.6726321298566504, "learning_rate": 4.7291776489193466e-07, "logits/chosen": -0.12401123344898224, "logits/rejected": -0.3377441465854645, "logps/chosen": -434.20001220703125, "logps/rejected": -453.0, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -0.6109985113143921, "rewards/margins": 7.646874904632568, "rewards/rejected": -8.2578125, "step": 8000 }, { "epoch": 2.1112282551396944, "grad_norm": 33.88162362046839, "learning_rate": 4.7225882973115445e-07, "logits/chosen": -0.11463622748851776, "logits/rejected": -0.36064988374710083, "logps/chosen": -357.04998779296875, "logps/rejected": -426.1000061035156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -1.073510766029358, "rewards/margins": 7.675000190734863, "rewards/rejected": -8.751562118530273, "step": 8010 }, { "epoch": 2.113863995782815, "grad_norm": 6.054900942826667, "learning_rate": 4.7159989457037425e-07, "logits/chosen": -0.09710693359375, "logits/rejected": -0.36835938692092896, "logps/chosen": -369.1000061035156, "logps/rejected": -437.3999938964844, "loss": 0.0166, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.877685546875, "rewards/margins": 7.974999904632568, "rewards/rejected": -8.854687690734863, "step": 8020 }, { "epoch": 2.1164997364259355, "grad_norm": 1.046277153774691, "learning_rate": 4.7094095940959405e-07, "logits/chosen": -0.203155517578125, "logits/rejected": -0.34898680448532104, "logps/chosen": -378.75, "logps/rejected": -475.0, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.615576148033142, "rewards/margins": 7.965624809265137, "rewards/rejected": -9.576562881469727, "step": 8030 }, { "epoch": 2.1191354770690563, "grad_norm": 11.101021140929465, "learning_rate": 4.702820242488139e-07, "logits/chosen": -0.10382080078125, "logits/rejected": -0.49755859375, "logps/chosen": -381.70001220703125, "logps/rejected": -426.8999938964844, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -1.11865234375, "rewards/margins": 7.915625095367432, "rewards/rejected": -9.0390625, "step": 8040 }, { "epoch": 2.121771217712177, "grad_norm": 1.6714650107921318, "learning_rate": 4.696230890880337e-07, "logits/chosen": -0.11708984524011612, "logits/rejected": -0.483795166015625, "logps/chosen": -435.70001220703125, "logps/rejected": -465.5, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.1593749523162842, "rewards/margins": 7.971875190734863, "rewards/rejected": -9.1328125, "step": 8050 }, { "epoch": 2.124406958355298, "grad_norm": 3.559610813994201, "learning_rate": 4.6896415392725355e-07, "logits/chosen": -0.26811522245407104, "logits/rejected": -0.617724597454071, "logps/chosen": -367.25, "logps/rejected": -413.5, "loss": 0.0147, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8776611089706421, "rewards/margins": 8.0, "rewards/rejected": -8.873437881469727, "step": 8060 }, { "epoch": 2.1270426989984186, "grad_norm": 3.7730297950009906, "learning_rate": 4.6830521876647334e-07, "logits/chosen": -0.14492186903953552, "logits/rejected": -0.427581787109375, "logps/chosen": -419.25, "logps/rejected": -447.8999938964844, "loss": 0.0131, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7380859851837158, "rewards/margins": 8.037500381469727, "rewards/rejected": -9.779687881469727, "step": 8070 }, { "epoch": 2.1296784396415394, "grad_norm": 8.835260488141193, "learning_rate": 4.676462836056932e-07, "logits/chosen": -0.11387939751148224, "logits/rejected": -0.482177734375, "logps/chosen": -377.0, "logps/rejected": -430.6000061035156, "loss": 0.0221, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2195312976837158, "rewards/margins": 7.8125, "rewards/rejected": -9.0234375, "step": 8080 }, { "epoch": 2.13231418028466, "grad_norm": 2.3066741555155756, "learning_rate": 4.66987348444913e-07, "logits/chosen": 0.0035156249068677425, "logits/rejected": -0.41175538301467896, "logps/chosen": -365.6499938964844, "logps/rejected": -443.70001220703125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.972552478313446, "rewards/margins": 7.629687309265137, "rewards/rejected": -8.604687690734863, "step": 8090 }, { "epoch": 2.134949920927781, "grad_norm": 3.7536390180316186, "learning_rate": 4.663284132841328e-07, "logits/chosen": -0.05616455152630806, "logits/rejected": -0.37822264432907104, "logps/chosen": -339.54998779296875, "logps/rejected": -379.6499938964844, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -1.057519555091858, "rewards/margins": 7.5078125, "rewards/rejected": -8.564062118530273, "step": 8100 }, { "epoch": 2.1375856615709012, "grad_norm": 14.67802449639903, "learning_rate": 4.6566947812335264e-07, "logits/chosen": -0.05077514797449112, "logits/rejected": -0.4020752012729645, "logps/chosen": -381.45001220703125, "logps/rejected": -422.8999938964844, "loss": 0.0172, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3569824695587158, "rewards/margins": 7.237500190734863, "rewards/rejected": -8.600000381469727, "step": 8110 }, { "epoch": 2.140221402214022, "grad_norm": 6.916509061436545, "learning_rate": 4.650105429625725e-07, "logits/chosen": -0.25019532442092896, "logits/rejected": -0.541430652141571, "logps/chosen": -389.25, "logps/rejected": -374.75, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -1.168115258216858, "rewards/margins": 7.418749809265137, "rewards/rejected": -8.587499618530273, "step": 8120 }, { "epoch": 2.142857142857143, "grad_norm": 22.076786910088476, "learning_rate": 4.643516078017923e-07, "logits/chosen": -0.2860656678676605, "logits/rejected": -0.5229126214981079, "logps/chosen": -356.8500061035156, "logps/rejected": -408.5, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7857421636581421, "rewards/margins": 7.7421875, "rewards/rejected": -8.517187118530273, "step": 8130 }, { "epoch": 2.1454928835002636, "grad_norm": 6.475825130790931, "learning_rate": 4.636926726410121e-07, "logits/chosen": -0.12540283799171448, "logits/rejected": -0.3233093321323395, "logps/chosen": -397.3999938964844, "logps/rejected": -444.75, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -0.7879394292831421, "rewards/margins": 7.282812595367432, "rewards/rejected": -8.067187309265137, "step": 8140 }, { "epoch": 2.1481286241433843, "grad_norm": 0.9405037884535301, "learning_rate": 4.6303373748023193e-07, "logits/chosen": 0.0003784179571084678, "logits/rejected": -0.44050294160842896, "logps/chosen": -431.8999938964844, "logps/rejected": -423.1000061035156, "loss": 0.0146, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.589404284954071, "rewards/margins": 7.412499904632568, "rewards/rejected": -8.004687309265137, "step": 8150 }, { "epoch": 2.150764364786505, "grad_norm": 1.2907591139562928, "learning_rate": 4.6237480231945173e-07, "logits/chosen": -0.0311279296875, "logits/rejected": -0.44038087129592896, "logps/chosen": -415.1000061035156, "logps/rejected": -408.70001220703125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -0.6792358160018921, "rewards/margins": 8.434374809265137, "rewards/rejected": -9.114062309265137, "step": 8160 }, { "epoch": 2.153400105429626, "grad_norm": 5.8218792409771645, "learning_rate": 4.617158671586716e-07, "logits/chosen": -0.24169921875, "logits/rejected": -0.532763659954071, "logps/chosen": -407.25, "logps/rejected": -437.6000061035156, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.112207055091858, "rewards/margins": 7.995312690734863, "rewards/rejected": -9.1015625, "step": 8170 }, { "epoch": 2.1560358460727462, "grad_norm": 95.52621222947533, "learning_rate": 4.610569319978914e-07, "logits/chosen": -0.28334349393844604, "logits/rejected": -0.43378907442092896, "logps/chosen": -369.20001220703125, "logps/rejected": -402.6000061035156, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.007959008216858, "rewards/margins": 7.592187404632568, "rewards/rejected": -8.604687690734863, "step": 8180 }, { "epoch": 2.158671586715867, "grad_norm": 4.872736708231355, "learning_rate": 4.6039799683711123e-07, "logits/chosen": -0.16314086318016052, "logits/rejected": -0.527539074420929, "logps/chosen": -379.3500061035156, "logps/rejected": -439.29998779296875, "loss": 0.0145, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.685791015625, "rewards/margins": 7.982812404632568, "rewards/rejected": -9.6640625, "step": 8190 }, { "epoch": 2.161307327358988, "grad_norm": 15.809378032931164, "learning_rate": 4.5973906167633103e-07, "logits/chosen": -0.29036253690719604, "logits/rejected": -0.5408538579940796, "logps/chosen": -338.4750061035156, "logps/rejected": -397.75, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.158544898033142, "rewards/margins": 7.479687690734863, "rewards/rejected": -8.640625, "step": 8200 }, { "epoch": 2.1639430680021086, "grad_norm": 34.130475928723364, "learning_rate": 4.590801265155508e-07, "logits/chosen": -0.36467283964157104, "logits/rejected": -0.5821533203125, "logps/chosen": -407.0, "logps/rejected": -424.70001220703125, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.7859375476837158, "rewards/margins": 7.6953125, "rewards/rejected": -9.481249809265137, "step": 8210 }, { "epoch": 2.1665788086452293, "grad_norm": 2.549717898763933, "learning_rate": 4.584211913547706e-07, "logits/chosen": -0.3342041075229645, "logits/rejected": -0.6158202886581421, "logps/chosen": -353.1000061035156, "logps/rejected": -399.6000061035156, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.32135009765625, "rewards/margins": 8.246874809265137, "rewards/rejected": -9.556249618530273, "step": 8220 }, { "epoch": 2.16921454928835, "grad_norm": 5.2452408593458495, "learning_rate": 4.577622561939905e-07, "logits/chosen": -0.22272948920726776, "logits/rejected": -0.5365234613418579, "logps/chosen": -423.3999938964844, "logps/rejected": -427.29998779296875, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.476464867591858, "rewards/margins": 8.167187690734863, "rewards/rejected": -9.640625, "step": 8230 }, { "epoch": 2.171850289931471, "grad_norm": 17.997970439073598, "learning_rate": 4.571033210332103e-07, "logits/chosen": -0.17955322563648224, "logits/rejected": -0.4689529538154602, "logps/chosen": -391.6000061035156, "logps/rejected": -450.3999938964844, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.945892333984375, "rewards/margins": 8.534375190734863, "rewards/rejected": -9.479687690734863, "step": 8240 }, { "epoch": 2.1744860305745917, "grad_norm": 5.82560863514099, "learning_rate": 4.564443858724301e-07, "logits/chosen": -0.49199217557907104, "logits/rejected": -0.602832019329071, "logps/chosen": -369.29998779296875, "logps/rejected": -424.95001220703125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.439306616783142, "rewards/margins": 8.154687881469727, "rewards/rejected": -9.5859375, "step": 8250 }, { "epoch": 2.177121771217712, "grad_norm": 1.641845005489589, "learning_rate": 4.5578545071164997e-07, "logits/chosen": -0.28660279512405396, "logits/rejected": -0.634350597858429, "logps/chosen": -372.1499938964844, "logps/rejected": -407.20001220703125, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9972168207168579, "rewards/margins": 7.9609375, "rewards/rejected": -8.964062690734863, "step": 8260 }, { "epoch": 2.1797575118608328, "grad_norm": 4.658135740326557, "learning_rate": 4.5512651555086977e-07, "logits/chosen": -0.05078125, "logits/rejected": -0.4454711973667145, "logps/chosen": -384.3999938964844, "logps/rejected": -429.8500061035156, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -1.108300805091858, "rewards/margins": 7.807812690734863, "rewards/rejected": -8.910937309265137, "step": 8270 }, { "epoch": 2.1823932525039536, "grad_norm": 6.112228205968455, "learning_rate": 4.5446758039008956e-07, "logits/chosen": -0.42256468534469604, "logits/rejected": -0.547900378704071, "logps/chosen": -375.0, "logps/rejected": -398.25, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.458984375, "rewards/margins": 7.567187309265137, "rewards/rejected": -9.026562690734863, "step": 8280 }, { "epoch": 2.1850289931470743, "grad_norm": 31.0983237177889, "learning_rate": 4.538086452293094e-07, "logits/chosen": -0.21647949516773224, "logits/rejected": -0.649884045124054, "logps/chosen": -365.29998779296875, "logps/rejected": -407.79998779296875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -1.456640601158142, "rewards/margins": 7.756249904632568, "rewards/rejected": -9.2109375, "step": 8290 }, { "epoch": 2.187664733790195, "grad_norm": 2.617427389053542, "learning_rate": 4.5314971006852926e-07, "logits/chosen": -0.2554687559604645, "logits/rejected": -0.519335925579071, "logps/chosen": -322.1000061035156, "logps/rejected": -416.5, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.317041039466858, "rewards/margins": 7.895312309265137, "rewards/rejected": -9.201562881469727, "step": 8300 }, { "epoch": 2.190300474433316, "grad_norm": 22.656540780472742, "learning_rate": 4.5249077490774906e-07, "logits/chosen": -0.1329345703125, "logits/rejected": -0.3530639708042145, "logps/chosen": -422.75, "logps/rejected": -492.0, "loss": 0.0274, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1836059093475342, "rewards/margins": 8.253125190734863, "rewards/rejected": -9.434374809265137, "step": 8310 }, { "epoch": 2.1929362150764367, "grad_norm": 12.942991182589525, "learning_rate": 4.5183183974696886e-07, "logits/chosen": -0.32874831557273865, "logits/rejected": -0.5111938714981079, "logps/chosen": -382.6000061035156, "logps/rejected": -456.20001220703125, "loss": 0.0196, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9736328125, "rewards/margins": 7.707812309265137, "rewards/rejected": -9.692187309265137, "step": 8320 }, { "epoch": 2.195571955719557, "grad_norm": 2.5370343258915256, "learning_rate": 4.5117290458618866e-07, "logits/chosen": -0.40587157011032104, "logits/rejected": -0.68505859375, "logps/chosen": -421.70001220703125, "logps/rejected": -450.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.2038085460662842, "rewards/margins": 8.604687690734863, "rewards/rejected": -9.800000190734863, "step": 8330 }, { "epoch": 2.1982076963626778, "grad_norm": 21.925337447402363, "learning_rate": 4.5051396942540856e-07, "logits/chosen": -0.31425780057907104, "logits/rejected": -0.5526367425918579, "logps/chosen": -411.6499938964844, "logps/rejected": -448.79998779296875, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.675286889076233, "rewards/margins": 7.7265625, "rewards/rejected": -9.409375190734863, "step": 8340 }, { "epoch": 2.2008434370057985, "grad_norm": 25.568809144439083, "learning_rate": 4.4985503426462836e-07, "logits/chosen": -0.16243895888328552, "logits/rejected": -0.6385742425918579, "logps/chosen": -394.6499938964844, "logps/rejected": -429.6499938964844, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.1161682605743408, "rewards/margins": 8.423437118530273, "rewards/rejected": -9.542187690734863, "step": 8350 }, { "epoch": 2.2034791776489193, "grad_norm": 10.854099435680446, "learning_rate": 4.4919609910384815e-07, "logits/chosen": -0.3678955137729645, "logits/rejected": -0.558178722858429, "logps/chosen": -362.1000061035156, "logps/rejected": -450.25, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -1.068115234375, "rewards/margins": 7.734375, "rewards/rejected": -8.800000190734863, "step": 8360 }, { "epoch": 2.20611491829204, "grad_norm": 1.4248796354815634, "learning_rate": 4.48537163943068e-07, "logits/chosen": -0.3278564512729645, "logits/rejected": -0.2982849180698395, "logps/chosen": -350.8500061035156, "logps/rejected": -445.8999938964844, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -0.4690185487270355, "rewards/margins": 8.003125190734863, "rewards/rejected": -8.475000381469727, "step": 8370 }, { "epoch": 2.208750658935161, "grad_norm": 21.48166853674235, "learning_rate": 4.478782287822878e-07, "logits/chosen": -0.27424925565719604, "logits/rejected": -0.5956665277481079, "logps/chosen": -388.04998779296875, "logps/rejected": -457.25, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.962719738483429, "rewards/margins": 8.342187881469727, "rewards/rejected": -9.306249618530273, "step": 8380 }, { "epoch": 2.2113863995782816, "grad_norm": 9.60369588044057, "learning_rate": 4.472192936215076e-07, "logits/chosen": -0.39304810762405396, "logits/rejected": -0.583020031452179, "logps/chosen": -361.3999938964844, "logps/rejected": -416.95001220703125, "loss": 0.0243, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.975299060344696, "rewards/margins": 8.301562309265137, "rewards/rejected": -9.274999618530273, "step": 8390 }, { "epoch": 2.2140221402214024, "grad_norm": 10.088296255915667, "learning_rate": 4.4656035846072745e-07, "logits/chosen": -0.29777222871780396, "logits/rejected": -0.5801757574081421, "logps/chosen": -394.6499938964844, "logps/rejected": -451.8999938964844, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.796215832233429, "rewards/margins": 8.201562881469727, "rewards/rejected": -8.990625381469727, "step": 8400 }, { "epoch": 2.2166578808645228, "grad_norm": 4.089623314403028, "learning_rate": 4.459014232999473e-07, "logits/chosen": -0.564013659954071, "logits/rejected": -0.5519775152206421, "logps/chosen": -377.3999938964844, "logps/rejected": -477.6000061035156, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.864453136920929, "rewards/margins": 7.998437404632568, "rewards/rejected": -8.871874809265137, "step": 8410 }, { "epoch": 2.2192936215076435, "grad_norm": 17.1953662099306, "learning_rate": 4.452424881391671e-07, "logits/chosen": -0.32831573486328125, "logits/rejected": -0.583544909954071, "logps/chosen": -410.04998779296875, "logps/rejected": -462.70001220703125, "loss": 0.0141, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2347900867462158, "rewards/margins": 8.379687309265137, "rewards/rejected": -9.610937118530273, "step": 8420 }, { "epoch": 2.2219293621507643, "grad_norm": 1.7615638623890035, "learning_rate": 4.445835529783869e-07, "logits/chosen": -0.5687255859375, "logits/rejected": -0.75811767578125, "logps/chosen": -378.8500061035156, "logps/rejected": -416.5, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.9455077648162842, "rewards/margins": 7.864062309265137, "rewards/rejected": -9.814062118530273, "step": 8430 }, { "epoch": 2.224565102793885, "grad_norm": 8.92840054520085, "learning_rate": 4.439246178176067e-07, "logits/chosen": -0.31293946504592896, "logits/rejected": -0.5700317621231079, "logps/chosen": -385.95001220703125, "logps/rejected": -463.79998779296875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.876708984375, "rewards/margins": 8.610937118530273, "rewards/rejected": -10.487500190734863, "step": 8440 }, { "epoch": 2.227200843437006, "grad_norm": 3.49707754097767, "learning_rate": 4.4326568265682654e-07, "logits/chosen": -0.436178594827652, "logits/rejected": -0.6549316644668579, "logps/chosen": -399.8500061035156, "logps/rejected": -398.45001220703125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.25482177734375, "rewards/margins": 8.199999809265137, "rewards/rejected": -9.446874618530273, "step": 8450 }, { "epoch": 2.2298365840801266, "grad_norm": 5.944742572499458, "learning_rate": 4.426067474960464e-07, "logits/chosen": -0.3897949159145355, "logits/rejected": -0.47755128145217896, "logps/chosen": -383.6000061035156, "logps/rejected": -425.3999938964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.3811523914337158, "rewards/margins": 8.004687309265137, "rewards/rejected": -9.393750190734863, "step": 8460 }, { "epoch": 2.2324723247232474, "grad_norm": 4.618454096146627, "learning_rate": 4.419478123352662e-07, "logits/chosen": -0.32386475801467896, "logits/rejected": -0.5047241449356079, "logps/chosen": -398.79998779296875, "logps/rejected": -435.6000061035156, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.956298828125, "rewards/margins": 7.795312404632568, "rewards/rejected": -8.753125190734863, "step": 8470 }, { "epoch": 2.2351080653663677, "grad_norm": 16.10012478332943, "learning_rate": 4.41288877174486e-07, "logits/chosen": -0.2864013612270355, "logits/rejected": -0.5868896245956421, "logps/chosen": -391.6000061035156, "logps/rejected": -419.04998779296875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -1.3852050304412842, "rewards/margins": 7.59375, "rewards/rejected": -8.973437309265137, "step": 8480 }, { "epoch": 2.2377438060094885, "grad_norm": 4.734552383113841, "learning_rate": 4.4062994201370584e-07, "logits/chosen": -0.35112303495407104, "logits/rejected": -0.651196300983429, "logps/chosen": -342.25, "logps/rejected": -385.3999938964844, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.167236328125, "rewards/margins": 7.668749809265137, "rewards/rejected": -8.839062690734863, "step": 8490 }, { "epoch": 2.2403795466526093, "grad_norm": 0.9904105003968097, "learning_rate": 4.3997100685292563e-07, "logits/chosen": -0.49104005098342896, "logits/rejected": -0.72412109375, "logps/chosen": -405.70001220703125, "logps/rejected": -412.3999938964844, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -1.223535180091858, "rewards/margins": 7.965624809265137, "rewards/rejected": -9.203125, "step": 8500 }, { "epoch": 2.24301528729573, "grad_norm": 6.939474086513625, "learning_rate": 4.393120716921455e-07, "logits/chosen": -0.3934570252895355, "logits/rejected": -0.5060790777206421, "logps/chosen": -418.6499938964844, "logps/rejected": -497.20001220703125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.2224609851837158, "rewards/margins": 8.603124618530273, "rewards/rejected": -9.831250190734863, "step": 8510 }, { "epoch": 2.245651027938851, "grad_norm": 4.2219706851077445, "learning_rate": 4.3865313653136533e-07, "logits/chosen": -0.45435792207717896, "logits/rejected": -0.614880383014679, "logps/chosen": -371.25, "logps/rejected": -413.5, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.67218017578125, "rewards/margins": 8.167187690734863, "rewards/rejected": -9.842187881469727, "step": 8520 }, { "epoch": 2.2482867685819716, "grad_norm": 1.7790205746420364, "learning_rate": 4.3799420137058513e-07, "logits/chosen": -0.5192016363143921, "logits/rejected": -0.7398926019668579, "logps/chosen": -406.8500061035156, "logps/rejected": -456.5, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.8228027820587158, "rewards/margins": 8.171875, "rewards/rejected": -9.990625381469727, "step": 8530 }, { "epoch": 2.2509225092250924, "grad_norm": 6.10983142741802, "learning_rate": 4.3733526620980493e-07, "logits/chosen": -0.45930176973342896, "logits/rejected": -0.6492675542831421, "logps/chosen": -394.70001220703125, "logps/rejected": -472.0, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -1.564965844154358, "rewards/margins": 8.399999618530273, "rewards/rejected": -9.964062690734863, "step": 8540 }, { "epoch": 2.253558249868213, "grad_norm": 18.94061964679294, "learning_rate": 4.3667633104902473e-07, "logits/chosen": -0.2578369081020355, "logits/rejected": -0.823046863079071, "logps/chosen": -389.3500061035156, "logps/rejected": -422.0, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7594726085662842, "rewards/margins": 8.256250381469727, "rewards/rejected": -10.0078125, "step": 8550 }, { "epoch": 2.2561939905113335, "grad_norm": 47.239538276008346, "learning_rate": 4.360173958882446e-07, "logits/chosen": -0.4512878358364105, "logits/rejected": -0.5230957269668579, "logps/chosen": -341.1000061035156, "logps/rejected": -416.3999938964844, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -1.2631347179412842, "rewards/margins": 7.775000095367432, "rewards/rejected": -9.046875, "step": 8560 }, { "epoch": 2.2588297311544543, "grad_norm": 6.528067146169544, "learning_rate": 4.3535846072746443e-07, "logits/chosen": -0.56787109375, "logits/rejected": -0.6281982660293579, "logps/chosen": -375.1000061035156, "logps/rejected": -440.1000061035156, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.973583996295929, "rewards/margins": 8.162500381469727, "rewards/rejected": -9.128125190734863, "step": 8570 }, { "epoch": 2.261465471797575, "grad_norm": 0.615843929663593, "learning_rate": 4.346995255666842e-07, "logits/chosen": -0.307089239358902, "logits/rejected": -0.4875732362270355, "logps/chosen": -400.6000061035156, "logps/rejected": -465.20001220703125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.781445324420929, "rewards/margins": 8.139062881469727, "rewards/rejected": -8.928125381469727, "step": 8580 }, { "epoch": 2.264101212440696, "grad_norm": 13.48551494315514, "learning_rate": 4.34040590405904e-07, "logits/chosen": -0.2894287109375, "logits/rejected": -0.602587878704071, "logps/chosen": -393.3500061035156, "logps/rejected": -405.54998779296875, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -1.2451171875, "rewards/margins": 7.4375, "rewards/rejected": -8.6796875, "step": 8590 }, { "epoch": 2.2667369530838166, "grad_norm": 1.1687297553191518, "learning_rate": 4.3338165524512387e-07, "logits/chosen": -0.44921875, "logits/rejected": -0.7051498293876648, "logps/chosen": -376.75, "logps/rejected": -430.1000061035156, "loss": 0.0198, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5403320789337158, "rewards/margins": 8.217187881469727, "rewards/rejected": -9.753125190734863, "step": 8600 }, { "epoch": 2.2693726937269374, "grad_norm": 8.453749715968796, "learning_rate": 4.3272272008434367e-07, "logits/chosen": -0.17275390028953552, "logits/rejected": -0.48090821504592896, "logps/chosen": -412.70001220703125, "logps/rejected": -476.79998779296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.862890601158142, "rewards/margins": 8.807812690734863, "rewards/rejected": -10.6875, "step": 8610 }, { "epoch": 2.272008434370058, "grad_norm": 23.447774544865013, "learning_rate": 4.3206378492356347e-07, "logits/chosen": -0.3612304627895355, "logits/rejected": -0.6440185308456421, "logps/chosen": -362.04998779296875, "logps/rejected": -427.1000061035156, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.364941358566284, "rewards/margins": 7.990624904632568, "rewards/rejected": -10.3515625, "step": 8620 }, { "epoch": 2.2746441750131785, "grad_norm": 0.6399786417252178, "learning_rate": 4.314048497627833e-07, "logits/chosen": -0.3782699704170227, "logits/rejected": -0.6309570074081421, "logps/chosen": -397.25, "logps/rejected": -437.3999938964844, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -1.7947266101837158, "rewards/margins": 8.276562690734863, "rewards/rejected": -10.073437690734863, "step": 8630 }, { "epoch": 2.2772799156562993, "grad_norm": 16.147739064072354, "learning_rate": 4.3074591460200317e-07, "logits/chosen": -0.25041502714157104, "logits/rejected": -0.5210937261581421, "logps/chosen": -398.29998779296875, "logps/rejected": -456.79998779296875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.632226586341858, "rewards/margins": 8.565625190734863, "rewards/rejected": -10.199999809265137, "step": 8640 }, { "epoch": 2.27991565629942, "grad_norm": 4.154086826767632, "learning_rate": 4.3008697944122296e-07, "logits/chosen": -0.31297606229782104, "logits/rejected": -0.45185548067092896, "logps/chosen": -384.79998779296875, "logps/rejected": -446.8999938964844, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -1.9601562023162842, "rewards/margins": 7.965624809265137, "rewards/rejected": -9.9375, "step": 8650 }, { "epoch": 2.282551396942541, "grad_norm": 2.5390351453483637, "learning_rate": 4.2942804428044276e-07, "logits/chosen": -0.18803100287914276, "logits/rejected": -0.4834960997104645, "logps/chosen": -359.79998779296875, "logps/rejected": -420.8999938964844, "loss": 0.0175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7545410394668579, "rewards/margins": 7.740624904632568, "rewards/rejected": -8.485937118530273, "step": 8660 }, { "epoch": 2.2851871375856616, "grad_norm": 7.912712699605937, "learning_rate": 4.287691091196626e-07, "logits/chosen": -0.19896240532398224, "logits/rejected": -0.49760740995407104, "logps/chosen": -389.54998779296875, "logps/rejected": -456.54998779296875, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -1.304174780845642, "rewards/margins": 7.779687404632568, "rewards/rejected": -9.081250190734863, "step": 8670 }, { "epoch": 2.2878228782287824, "grad_norm": 10.467901174724686, "learning_rate": 4.2811017395888246e-07, "logits/chosen": -0.2958129942417145, "logits/rejected": -0.6491607427597046, "logps/chosen": -400.6000061035156, "logps/rejected": -412.0, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.3779296875, "rewards/margins": 8.310937881469727, "rewards/rejected": -9.698437690734863, "step": 8680 }, { "epoch": 2.290458618871903, "grad_norm": 4.374375345064718, "learning_rate": 4.2745123879810226e-07, "logits/chosen": -0.3914428651332855, "logits/rejected": -0.601318359375, "logps/chosen": -382.04998779296875, "logps/rejected": -444.04998779296875, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -1.279541015625, "rewards/margins": 8.201562881469727, "rewards/rejected": -9.484375, "step": 8690 }, { "epoch": 2.293094359515024, "grad_norm": 19.401333035482153, "learning_rate": 4.2679230363732206e-07, "logits/chosen": -0.23280028998851776, "logits/rejected": -0.5367187261581421, "logps/chosen": -403.25, "logps/rejected": -429.5, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5347411632537842, "rewards/margins": 8.346875190734863, "rewards/rejected": -9.881250381469727, "step": 8700 }, { "epoch": 2.2957301001581443, "grad_norm": 6.699437230934632, "learning_rate": 4.261333684765419e-07, "logits/chosen": -0.35310059785842896, "logits/rejected": -0.49437254667282104, "logps/chosen": -334.95001220703125, "logps/rejected": -410.20001220703125, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -1.740820288658142, "rewards/margins": 8.079687118530273, "rewards/rejected": -9.823437690734863, "step": 8710 }, { "epoch": 2.298365840801265, "grad_norm": 2.222209769626922, "learning_rate": 4.254744333157617e-07, "logits/chosen": -0.3786377012729645, "logits/rejected": -0.5527984499931335, "logps/chosen": -397.95001220703125, "logps/rejected": -456.3500061035156, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.652441382408142, "rewards/margins": 8.587499618530273, "rewards/rejected": -10.235937118530273, "step": 8720 }, { "epoch": 2.301001581444386, "grad_norm": 5.384826155758691, "learning_rate": 4.248154981549815e-07, "logits/chosen": -0.23067016899585724, "logits/rejected": -0.5321899652481079, "logps/chosen": -370.5, "logps/rejected": -412.1499938964844, "loss": 0.0449, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1794922351837158, "rewards/margins": 8.065625190734863, "rewards/rejected": -9.2421875, "step": 8730 }, { "epoch": 2.3036373220875066, "grad_norm": 7.752760522934258, "learning_rate": 4.2415656299420135e-07, "logits/chosen": -0.2742919921875, "logits/rejected": -0.5368210077285767, "logps/chosen": -380.0, "logps/rejected": -429.0, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.9199798107147217, "rewards/margins": 8.107812881469727, "rewards/rejected": -10.0234375, "step": 8740 }, { "epoch": 2.3062730627306274, "grad_norm": 1.224725128958731, "learning_rate": 4.234976278334212e-07, "logits/chosen": -0.286630243062973, "logits/rejected": -0.588287353515625, "logps/chosen": -404.20001220703125, "logps/rejected": -411.95001220703125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -1.3810546398162842, "rewards/margins": 8.114062309265137, "rewards/rejected": -9.481249809265137, "step": 8750 }, { "epoch": 2.308908803373748, "grad_norm": 1.8639339253409997, "learning_rate": 4.22838692672641e-07, "logits/chosen": -0.16075439751148224, "logits/rejected": -0.542529284954071, "logps/chosen": -403.20001220703125, "logps/rejected": -428.1000061035156, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.661230444908142, "rewards/margins": 8.176562309265137, "rewards/rejected": -9.834375381469727, "step": 8760 }, { "epoch": 2.311544544016869, "grad_norm": 1.1063788486451827, "learning_rate": 4.221797575118608e-07, "logits/chosen": -0.22052612900733948, "logits/rejected": -0.4670654237270355, "logps/chosen": -352.79998779296875, "logps/rejected": -398.75, "loss": 0.0346, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5498778820037842, "rewards/margins": 7.8125, "rewards/rejected": -9.360937118530273, "step": 8770 }, { "epoch": 2.3141802846599893, "grad_norm": 17.067662489152486, "learning_rate": 4.215208223510806e-07, "logits/chosen": -0.31219482421875, "logits/rejected": -0.6252685785293579, "logps/chosen": -437.1000061035156, "logps/rejected": -430.70001220703125, "loss": 0.0178, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.61492919921875, "rewards/margins": 8.387499809265137, "rewards/rejected": -9.998437881469727, "step": 8780 }, { "epoch": 2.31681602530311, "grad_norm": 5.705228257814234, "learning_rate": 4.2086188719030044e-07, "logits/chosen": -0.3968505859375, "logits/rejected": -0.5931030511856079, "logps/chosen": -348.45001220703125, "logps/rejected": -404.3999938964844, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -1.9724609851837158, "rewards/margins": 7.6328125, "rewards/rejected": -9.610937118530273, "step": 8790 }, { "epoch": 2.319451765946231, "grad_norm": 3.06241313526824, "learning_rate": 4.202029520295203e-07, "logits/chosen": -0.24351806938648224, "logits/rejected": -0.612500011920929, "logps/chosen": -368.29998779296875, "logps/rejected": -416.04998779296875, "loss": 0.0162, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.685937523841858, "rewards/margins": 7.989062309265137, "rewards/rejected": -9.671875, "step": 8800 }, { "epoch": 2.3220875065893516, "grad_norm": 12.851697794592235, "learning_rate": 4.195440168687401e-07, "logits/chosen": -0.20289306342601776, "logits/rejected": -0.4381103515625, "logps/chosen": -382.5, "logps/rejected": -436.8500061035156, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.323339819908142, "rewards/margins": 8.307812690734863, "rewards/rejected": -9.637499809265137, "step": 8810 }, { "epoch": 2.3247232472324724, "grad_norm": 6.4286384447508444, "learning_rate": 4.1888508170795994e-07, "logits/chosen": -0.25947266817092896, "logits/rejected": -0.395761102437973, "logps/chosen": -386.3500061035156, "logps/rejected": -403.1000061035156, "loss": 0.0171, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.783496081829071, "rewards/margins": 7.435937404632568, "rewards/rejected": -8.217187881469727, "step": 8820 }, { "epoch": 2.327358987875593, "grad_norm": 9.018946307180494, "learning_rate": 4.1822614654717974e-07, "logits/chosen": -0.08500976860523224, "logits/rejected": -0.28999024629592896, "logps/chosen": -397.5, "logps/rejected": -426.04998779296875, "loss": 0.0236, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0859375, "rewards/margins": 7.556250095367432, "rewards/rejected": -8.645312309265137, "step": 8830 }, { "epoch": 2.329994728518714, "grad_norm": 1.4512163240285727, "learning_rate": 4.1756721138639954e-07, "logits/chosen": -0.28636473417282104, "logits/rejected": -0.56207275390625, "logps/chosen": -389.29998779296875, "logps/rejected": -401.1000061035156, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -1.2448241710662842, "rewards/margins": 7.931250095367432, "rewards/rejected": -9.1796875, "step": 8840 }, { "epoch": 2.3326304691618347, "grad_norm": 2.8175553250859218, "learning_rate": 4.1690827622561933e-07, "logits/chosen": -0.18615111708641052, "logits/rejected": -0.45184326171875, "logps/chosen": -313.0, "logps/rejected": -413.1000061035156, "loss": 0.0277, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2841308116912842, "rewards/margins": 8.035937309265137, "rewards/rejected": -9.317187309265137, "step": 8850 }, { "epoch": 2.335266209804955, "grad_norm": 6.543958872589983, "learning_rate": 4.1624934106483924e-07, "logits/chosen": -0.3428710997104645, "logits/rejected": -0.42363280057907104, "logps/chosen": -366.20001220703125, "logps/rejected": -482.5, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -1.639746069908142, "rewards/margins": 8.456250190734863, "rewards/rejected": -10.084375381469727, "step": 8860 }, { "epoch": 2.337901950448076, "grad_norm": 2.2627637712334203, "learning_rate": 4.1559040590405903e-07, "logits/chosen": -0.34064942598342896, "logits/rejected": -0.6556640863418579, "logps/chosen": -379.8999938964844, "logps/rejected": -433.95001220703125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -1.848046898841858, "rewards/margins": 8.0, "rewards/rejected": -9.848437309265137, "step": 8870 }, { "epoch": 2.3405376910911966, "grad_norm": 0.7301022272014804, "learning_rate": 4.1493147074327883e-07, "logits/chosen": -0.36787110567092896, "logits/rejected": -0.7283691167831421, "logps/chosen": -374.29998779296875, "logps/rejected": -395.20001220703125, "loss": 0.0234, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5493652820587158, "rewards/margins": 7.84375, "rewards/rejected": -9.393750190734863, "step": 8880 }, { "epoch": 2.3431734317343174, "grad_norm": 5.740725253090686, "learning_rate": 4.1427253558249863e-07, "logits/chosen": -0.22934570908546448, "logits/rejected": -0.5229736566543579, "logps/chosen": -396.45001220703125, "logps/rejected": -472.70001220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.120043992996216, "rewards/margins": 8.293749809265137, "rewards/rejected": -10.410937309265137, "step": 8890 }, { "epoch": 2.345809172377438, "grad_norm": 2.815061204734194, "learning_rate": 4.136136004217185e-07, "logits/chosen": -0.3052978515625, "logits/rejected": -0.556353747844696, "logps/chosen": -367.2749938964844, "logps/rejected": -399.8999938964844, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.7124512195587158, "rewards/margins": 8.128125190734863, "rewards/rejected": -9.834375381469727, "step": 8900 }, { "epoch": 2.348444913020559, "grad_norm": 12.668019354853223, "learning_rate": 4.1295466526093833e-07, "logits/chosen": -0.274658203125, "logits/rejected": -0.528613269329071, "logps/chosen": -385.20001220703125, "logps/rejected": -413.1499938964844, "loss": 0.021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.135546922683716, "rewards/margins": 8.920312881469727, "rewards/rejected": -11.057812690734863, "step": 8910 }, { "epoch": 2.3510806536636797, "grad_norm": 4.104079429901085, "learning_rate": 4.1229573010015813e-07, "logits/chosen": -0.3273559510707855, "logits/rejected": -0.629040539264679, "logps/chosen": -399.5, "logps/rejected": -471.6000061035156, "loss": 0.0401, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4530272483825684, "rewards/margins": 9.2421875, "rewards/rejected": -11.698437690734863, "step": 8920 }, { "epoch": 2.3537163943068, "grad_norm": 44.81524334344638, "learning_rate": 4.11636794939378e-07, "logits/chosen": -0.30720216035842896, "logits/rejected": -0.6252502202987671, "logps/chosen": -378.04998779296875, "logps/rejected": -420.79998779296875, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.037109375, "rewards/margins": 8.171875, "rewards/rejected": -10.209375381469727, "step": 8930 }, { "epoch": 2.356352134949921, "grad_norm": 12.774188765127198, "learning_rate": 4.109778597785978e-07, "logits/chosen": -0.39191895723342896, "logits/rejected": -0.4642089903354645, "logps/chosen": -416.75, "logps/rejected": -470.3999938964844, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -2.12109375, "rewards/margins": 8.287500381469727, "rewards/rejected": -10.409375190734863, "step": 8940 }, { "epoch": 2.3589878755930416, "grad_norm": 0.908200179711476, "learning_rate": 4.1031892461781757e-07, "logits/chosen": -0.39207762479782104, "logits/rejected": -0.6295410394668579, "logps/chosen": -379.1000061035156, "logps/rejected": -422.20001220703125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.656042456626892, "rewards/margins": 8.212499618530273, "rewards/rejected": -9.871874809265137, "step": 8950 }, { "epoch": 2.3616236162361623, "grad_norm": 23.482689851525965, "learning_rate": 4.0965998945703737e-07, "logits/chosen": -0.24830932915210724, "logits/rejected": -0.52685546875, "logps/chosen": -433.3500061035156, "logps/rejected": -464.70001220703125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.489355444908142, "rewards/margins": 8.3203125, "rewards/rejected": -9.8125, "step": 8960 }, { "epoch": 2.364259356879283, "grad_norm": 0.44698814587365854, "learning_rate": 4.0900105429625727e-07, "logits/chosen": -0.4020141661167145, "logits/rejected": -0.5230468511581421, "logps/chosen": -410.70001220703125, "logps/rejected": -440.0, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2677733898162842, "rewards/margins": 8.262499809265137, "rewards/rejected": -9.532812118530273, "step": 8970 }, { "epoch": 2.366895097522404, "grad_norm": 2.901916795397565, "learning_rate": 4.0834211913547707e-07, "logits/chosen": -0.3148193359375, "logits/rejected": -0.5255492925643921, "logps/chosen": -378.6000061035156, "logps/rejected": -465.6499938964844, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.4460937976837158, "rewards/margins": 9.159375190734863, "rewards/rejected": -10.609375, "step": 8980 }, { "epoch": 2.3695308381655247, "grad_norm": 33.9721618111374, "learning_rate": 4.0768318397469687e-07, "logits/chosen": -0.32432860136032104, "logits/rejected": -0.633135974407196, "logps/chosen": -359.6499938964844, "logps/rejected": -400.0, "loss": 0.0173, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.396813988685608, "rewards/margins": 8.2890625, "rewards/rejected": -9.6875, "step": 8990 }, { "epoch": 2.3721665788086455, "grad_norm": 2.0736593652448616, "learning_rate": 4.0702424881391666e-07, "logits/chosen": -0.2702392637729645, "logits/rejected": -0.6429198980331421, "logps/chosen": -388.8500061035156, "logps/rejected": -376.70001220703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -1.1925780773162842, "rewards/margins": 7.956250190734863, "rewards/rejected": -9.151562690734863, "step": 9000 }, { "epoch": 2.374802319451766, "grad_norm": 10.067703533546961, "learning_rate": 4.063653136531365e-07, "logits/chosen": -0.27907103300094604, "logits/rejected": -0.555938720703125, "logps/chosen": -386.8500061035156, "logps/rejected": -418.70001220703125, "loss": 0.0295, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6062500476837158, "rewards/margins": 8.439062118530273, "rewards/rejected": -10.048437118530273, "step": 9010 }, { "epoch": 2.3774380600948866, "grad_norm": 69.41731890813291, "learning_rate": 4.057063784923563e-07, "logits/chosen": -0.3470458984375, "logits/rejected": -0.45563966035842896, "logps/chosen": -369.1499938964844, "logps/rejected": -426.8999938964844, "loss": 0.0195, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3021240234375, "rewards/margins": 8.221875190734863, "rewards/rejected": -9.524999618530273, "step": 9020 }, { "epoch": 2.3800738007380073, "grad_norm": 1.0974963136479932, "learning_rate": 4.0504744333157616e-07, "logits/chosen": -0.286294549703598, "logits/rejected": -0.46992188692092896, "logps/chosen": -403.1499938964844, "logps/rejected": -451.95001220703125, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -1.240625023841858, "rewards/margins": 8.357812881469727, "rewards/rejected": -9.596875190734863, "step": 9030 }, { "epoch": 2.382709541381128, "grad_norm": 29.01640955826052, "learning_rate": 4.0438850817079596e-07, "logits/chosen": -0.27556151151657104, "logits/rejected": -0.552001953125, "logps/chosen": -380.79998779296875, "logps/rejected": -417.70001220703125, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.447851538658142, "rewards/margins": 8.159375190734863, "rewards/rejected": -9.604687690734863, "step": 9040 }, { "epoch": 2.385345282024249, "grad_norm": 18.8828155430424, "learning_rate": 4.037295730100158e-07, "logits/chosen": -0.23265381157398224, "logits/rejected": -0.5135498046875, "logps/chosen": -431.8999938964844, "logps/rejected": -467.1499938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.35302734375, "rewards/margins": 8.134374618530273, "rewards/rejected": -9.484375, "step": 9050 }, { "epoch": 2.3879810226673697, "grad_norm": 7.138631064028932, "learning_rate": 4.030706378492356e-07, "logits/chosen": -0.41374510526657104, "logits/rejected": -0.510498046875, "logps/chosen": -346.04998779296875, "logps/rejected": -409.25, "loss": 0.0166, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.509619116783142, "rewards/margins": 8.082812309265137, "rewards/rejected": -9.595312118530273, "step": 9060 }, { "epoch": 2.39061676331049, "grad_norm": 9.664700612426053, "learning_rate": 4.024117026884554e-07, "logits/chosen": -0.33295899629592896, "logits/rejected": -0.553387463092804, "logps/chosen": -386.45001220703125, "logps/rejected": -478.8999938964844, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.940771460533142, "rewards/margins": 8.512499809265137, "rewards/rejected": -10.451562881469727, "step": 9070 }, { "epoch": 2.3932525039536108, "grad_norm": 10.352620840273785, "learning_rate": 4.017527675276753e-07, "logits/chosen": -0.19111327826976776, "logits/rejected": -0.5412994623184204, "logps/chosen": -398.79998779296875, "logps/rejected": -435.1499938964844, "loss": 0.0244, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.876074194908142, "rewards/margins": 8.1640625, "rewards/rejected": -10.040624618530273, "step": 9080 }, { "epoch": 2.3958882445967316, "grad_norm": 33.60981020490735, "learning_rate": 4.010938323668951e-07, "logits/chosen": -0.22707518935203552, "logits/rejected": -0.49846190214157104, "logps/chosen": -397.29998779296875, "logps/rejected": -473.8999938964844, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.7957031726837158, "rewards/margins": 8.365625381469727, "rewards/rejected": -10.1640625, "step": 9090 }, { "epoch": 2.3985239852398523, "grad_norm": 30.352276705475997, "learning_rate": 4.004348972061149e-07, "logits/chosen": -0.3973022401332855, "logits/rejected": -0.561279296875, "logps/chosen": -353.3500061035156, "logps/rejected": -434.5, "loss": 0.0227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.738378882408142, "rewards/margins": 7.764062404632568, "rewards/rejected": -9.5078125, "step": 9100 }, { "epoch": 2.401159725882973, "grad_norm": 2.4153516337294247, "learning_rate": 3.997759620453347e-07, "logits/chosen": -0.3875976502895355, "logits/rejected": -0.60015869140625, "logps/chosen": -388.8500061035156, "logps/rejected": -441.0, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.442529320716858, "rewards/margins": 8.165624618530273, "rewards/rejected": -9.606249809265137, "step": 9110 }, { "epoch": 2.403795466526094, "grad_norm": 8.83008999252682, "learning_rate": 3.9911702688455455e-07, "logits/chosen": -0.08194275200366974, "logits/rejected": -0.6605468988418579, "logps/chosen": -414.5, "logps/rejected": -427.29998779296875, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.4836914539337158, "rewards/margins": 8.699999809265137, "rewards/rejected": -10.181249618530273, "step": 9120 }, { "epoch": 2.4064312071692147, "grad_norm": 2.313789589224406, "learning_rate": 3.9845809172377435e-07, "logits/chosen": -0.41077882051467896, "logits/rejected": -0.5966796875, "logps/chosen": -391.5, "logps/rejected": -444.1000061035156, "loss": 0.0126, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.361328125, "rewards/margins": 8.284375190734863, "rewards/rejected": -9.639062881469727, "step": 9130 }, { "epoch": 2.4090669478123354, "grad_norm": 9.815778425638218, "learning_rate": 3.977991565629942e-07, "logits/chosen": -0.32429200410842896, "logits/rejected": -0.46815794706344604, "logps/chosen": -420.8999938964844, "logps/rejected": -462.8999938964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.019995093345642, "rewards/margins": 8.381250381469727, "rewards/rejected": -9.399999618530273, "step": 9140 }, { "epoch": 2.411702688455456, "grad_norm": 5.357258404293557, "learning_rate": 3.97140221402214e-07, "logits/chosen": -0.29913330078125, "logits/rejected": -0.60614013671875, "logps/chosen": -428.20001220703125, "logps/rejected": -449.3999938964844, "loss": 0.0252, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1412110328674316, "rewards/margins": 7.78125, "rewards/rejected": -9.918749809265137, "step": 9150 }, { "epoch": 2.4143384290985765, "grad_norm": 9.834030350548904, "learning_rate": 3.9648128624143384e-07, "logits/chosen": -0.1820419281721115, "logits/rejected": -0.5857177972793579, "logps/chosen": -410.8999938964844, "logps/rejected": -452.79998779296875, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -1.7340819835662842, "rewards/margins": 8.385937690734863, "rewards/rejected": -10.123437881469727, "step": 9160 }, { "epoch": 2.4169741697416973, "grad_norm": 0.241024807582128, "learning_rate": 3.9582235108065364e-07, "logits/chosen": -0.47846680879592896, "logits/rejected": -0.703326404094696, "logps/chosen": -409.1000061035156, "logps/rejected": -448.8999938964844, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -2.031445264816284, "rewards/margins": 7.946875095367432, "rewards/rejected": -9.973437309265137, "step": 9170 }, { "epoch": 2.419609910384818, "grad_norm": 2.3103618504948678, "learning_rate": 3.9516341591987344e-07, "logits/chosen": -0.270407110452652, "logits/rejected": -0.6043335199356079, "logps/chosen": -407.8999938964844, "logps/rejected": -442.20001220703125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -1.3177978992462158, "rewards/margins": 8.160937309265137, "rewards/rejected": -9.4765625, "step": 9180 }, { "epoch": 2.422245651027939, "grad_norm": 24.34130764956091, "learning_rate": 3.9450448075909324e-07, "logits/chosen": -0.34471434354782104, "logits/rejected": -0.471923828125, "logps/chosen": -410.1499938964844, "logps/rejected": -500.5, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -2.012988328933716, "rewards/margins": 8.540624618530273, "rewards/rejected": -10.553125381469727, "step": 9190 }, { "epoch": 2.4248813916710596, "grad_norm": 127.976664239072, "learning_rate": 3.9384554559831314e-07, "logits/chosen": -0.22175903618335724, "logits/rejected": -0.6766357421875, "logps/chosen": -421.25, "logps/rejected": -456.1000061035156, "loss": 0.0225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.366894483566284, "rewards/margins": 8.743749618530273, "rewards/rejected": -11.112500190734863, "step": 9200 }, { "epoch": 2.4275171323141804, "grad_norm": 21.99594555505775, "learning_rate": 3.9318661043753294e-07, "logits/chosen": -0.3242202699184418, "logits/rejected": -0.5015014410018921, "logps/chosen": -409.1000061035156, "logps/rejected": -472.75, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -2.0365233421325684, "rewards/margins": 8.2578125, "rewards/rejected": -10.287500381469727, "step": 9210 }, { "epoch": 2.4301528729573008, "grad_norm": 2.176830740497588, "learning_rate": 3.9252767527675273e-07, "logits/chosen": -0.277090460062027, "logits/rejected": -0.4281860291957855, "logps/chosen": -438.04998779296875, "logps/rejected": -428.0, "loss": 0.0153, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.840722680091858, "rewards/margins": 8.303125381469727, "rewards/rejected": -10.143750190734863, "step": 9220 }, { "epoch": 2.4327886136004215, "grad_norm": 1.5048986668346371, "learning_rate": 3.918687401159726e-07, "logits/chosen": -0.276254266500473, "logits/rejected": -0.56201171875, "logps/chosen": -375.70001220703125, "logps/rejected": -424.29998779296875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.118457078933716, "rewards/margins": 8.282812118530273, "rewards/rejected": -10.401562690734863, "step": 9230 }, { "epoch": 2.4354243542435423, "grad_norm": 40.12786355440359, "learning_rate": 3.912098049551924e-07, "logits/chosen": -0.36848145723342896, "logits/rejected": -0.5035034418106079, "logps/chosen": -420.79998779296875, "logps/rejected": -442.8500061035156, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.671484351158142, "rewards/margins": 7.985937595367432, "rewards/rejected": -9.660937309265137, "step": 9240 }, { "epoch": 2.438060094886663, "grad_norm": 32.01728618182824, "learning_rate": 3.9055086979441223e-07, "logits/chosen": -0.37199705839157104, "logits/rejected": -0.4770751893520355, "logps/chosen": -353.29998779296875, "logps/rejected": -457.29998779296875, "loss": 0.0313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6184570789337158, "rewards/margins": 8.0703125, "rewards/rejected": -9.681249618530273, "step": 9250 }, { "epoch": 2.440695835529784, "grad_norm": 1.2678736192393454, "learning_rate": 3.8989193463363203e-07, "logits/chosen": -0.17626342177391052, "logits/rejected": -0.539794921875, "logps/chosen": -371.29998779296875, "logps/rejected": -460.29998779296875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -2.0943360328674316, "rewards/margins": 8.699999809265137, "rewards/rejected": -10.798437118530273, "step": 9260 }, { "epoch": 2.4433315761729046, "grad_norm": 0.6104229168558113, "learning_rate": 3.892329994728519e-07, "logits/chosen": -0.4684081971645355, "logits/rejected": -0.5710693597793579, "logps/chosen": -351.5, "logps/rejected": -448.79998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.853515625, "rewards/margins": 8.673437118530273, "rewards/rejected": -10.524999618530273, "step": 9270 }, { "epoch": 2.4459673168160254, "grad_norm": 1.5860376833347039, "learning_rate": 3.885740643120717e-07, "logits/chosen": -0.28376466035842896, "logits/rejected": -0.6533203125, "logps/chosen": -395.70001220703125, "logps/rejected": -487.79998779296875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.584649682044983, "rewards/margins": 8.832812309265137, "rewards/rejected": -10.418749809265137, "step": 9280 }, { "epoch": 2.448603057459146, "grad_norm": 38.07876058646008, "learning_rate": 3.879151291512915e-07, "logits/chosen": -0.20456543564796448, "logits/rejected": -0.3994995057582855, "logps/chosen": -399.8999938964844, "logps/rejected": -424.3999938964844, "loss": 0.0226, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.628637671470642, "rewards/margins": 7.862500190734863, "rewards/rejected": -9.484375, "step": 9290 }, { "epoch": 2.451238798102267, "grad_norm": 8.709043763686177, "learning_rate": 3.8725619399051127e-07, "logits/chosen": -0.30451661348342896, "logits/rejected": -0.642901599407196, "logps/chosen": -391.45001220703125, "logps/rejected": -418.5, "loss": 0.0173, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.611962914466858, "rewards/margins": 8.089062690734863, "rewards/rejected": -9.701562881469727, "step": 9300 }, { "epoch": 2.4538745387453873, "grad_norm": 1.3371802117812561, "learning_rate": 3.865972588297312e-07, "logits/chosen": -0.3817138671875, "logits/rejected": -0.6410156488418579, "logps/chosen": -313.5, "logps/rejected": -423.8999938964844, "loss": 0.0173, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.654638648033142, "rewards/margins": 8.0859375, "rewards/rejected": -9.7421875, "step": 9310 }, { "epoch": 2.456510279388508, "grad_norm": 21.853625652338927, "learning_rate": 3.8593832366895097e-07, "logits/chosen": -0.2995758056640625, "logits/rejected": -0.6685119867324829, "logps/chosen": -382.54998779296875, "logps/rejected": -434.79998779296875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.743554711341858, "rewards/margins": 8.064062118530273, "rewards/rejected": -9.809374809265137, "step": 9320 }, { "epoch": 2.459146020031629, "grad_norm": 14.185842866681783, "learning_rate": 3.8527938850817077e-07, "logits/chosen": -0.38776856660842896, "logits/rejected": -0.5084594488143921, "logps/chosen": -385.54998779296875, "logps/rejected": -419.79998779296875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -1.1686522960662842, "rewards/margins": 7.948437690734863, "rewards/rejected": -9.118749618530273, "step": 9330 }, { "epoch": 2.4617817606747496, "grad_norm": 0.3531164668152636, "learning_rate": 3.8462045334739057e-07, "logits/chosen": -0.3091064393520355, "logits/rejected": -0.587207019329071, "logps/chosen": -433.79998779296875, "logps/rejected": -437.3999938964844, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -1.525781273841858, "rewards/margins": 7.8984375, "rewards/rejected": -9.425000190734863, "step": 9340 }, { "epoch": 2.4644175013178704, "grad_norm": 3.573064929933413, "learning_rate": 3.839615181866104e-07, "logits/chosen": -0.23796996474266052, "logits/rejected": -0.534686267375946, "logps/chosen": -378.3999938964844, "logps/rejected": -413.04998779296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -2.092334032058716, "rewards/margins": 7.770312309265137, "rewards/rejected": -9.856249809265137, "step": 9350 }, { "epoch": 2.467053241960991, "grad_norm": 5.532496340869573, "learning_rate": 3.833025830258302e-07, "logits/chosen": -0.2643493711948395, "logits/rejected": -0.3980712890625, "logps/chosen": -382.75, "logps/rejected": -484.5, "loss": 0.0131, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.919091820716858, "rewards/margins": 8.793749809265137, "rewards/rejected": -10.715624809265137, "step": 9360 }, { "epoch": 2.4696889826041115, "grad_norm": 10.131557161265926, "learning_rate": 3.8264364786505006e-07, "logits/chosen": -0.24875488877296448, "logits/rejected": -0.546093761920929, "logps/chosen": -419.29998779296875, "logps/rejected": -425.1000061035156, "loss": 0.0212, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8596680164337158, "rewards/margins": 8.176562309265137, "rewards/rejected": -10.043749809265137, "step": 9370 }, { "epoch": 2.4723247232472323, "grad_norm": 16.551638012357753, "learning_rate": 3.819847127042699e-07, "logits/chosen": -0.24693603813648224, "logits/rejected": -0.5663818120956421, "logps/chosen": -366.1499938964844, "logps/rejected": -411.0, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.686743140220642, "rewards/margins": 8.431249618530273, "rewards/rejected": -10.115625381469727, "step": 9380 }, { "epoch": 2.474960463890353, "grad_norm": 6.631007429070526, "learning_rate": 3.813257775434897e-07, "logits/chosen": -0.215606689453125, "logits/rejected": -0.4335266053676605, "logps/chosen": -409.6499938964844, "logps/rejected": -461.6000061035156, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -2.0729002952575684, "rewards/margins": 8.810937881469727, "rewards/rejected": -10.887499809265137, "step": 9390 }, { "epoch": 2.477596204533474, "grad_norm": 5.217885436385699, "learning_rate": 3.806668423827095e-07, "logits/chosen": -0.39430540800094604, "logits/rejected": -0.568188488483429, "logps/chosen": -434.3999938964844, "logps/rejected": -493.70001220703125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.4014649391174316, "rewards/margins": 8.701562881469727, "rewards/rejected": -11.103124618530273, "step": 9400 }, { "epoch": 2.4802319451765946, "grad_norm": 10.016533867373125, "learning_rate": 3.800079072219293e-07, "logits/chosen": -0.19831542670726776, "logits/rejected": -0.45964354276657104, "logps/chosen": -412.75, "logps/rejected": -438.3500061035156, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -1.9298827648162842, "rewards/margins": 8.829687118530273, "rewards/rejected": -10.751562118530273, "step": 9410 }, { "epoch": 2.4828676858197154, "grad_norm": 10.259317249430849, "learning_rate": 3.793489720611492e-07, "logits/chosen": -0.3504882752895355, "logits/rejected": -0.639892578125, "logps/chosen": -357.0, "logps/rejected": -432.5, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -2.4859375953674316, "rewards/margins": 8.34375, "rewards/rejected": -10.828125, "step": 9420 }, { "epoch": 2.485503426462836, "grad_norm": 1.2103279118797476, "learning_rate": 3.78690036900369e-07, "logits/chosen": -0.32038575410842896, "logits/rejected": -0.45166015625, "logps/chosen": -370.45001220703125, "logps/rejected": -446.8999938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.67724609375, "rewards/margins": 8.876562118530273, "rewards/rejected": -10.559374809265137, "step": 9430 }, { "epoch": 2.488139167105957, "grad_norm": 0.8542116864834, "learning_rate": 3.780311017395888e-07, "logits/chosen": -0.38190919160842896, "logits/rejected": -0.5828720331192017, "logps/chosen": -367.04998779296875, "logps/rejected": -445.20001220703125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -1.617919921875, "rewards/margins": 8.957812309265137, "rewards/rejected": -10.578125, "step": 9440 }, { "epoch": 2.4907749077490777, "grad_norm": 1.6534264021044414, "learning_rate": 3.773721665788086e-07, "logits/chosen": -0.25065916776657104, "logits/rejected": -0.6278076171875, "logps/chosen": -395.8500061035156, "logps/rejected": -466.0, "loss": 0.0329, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.942968726158142, "rewards/margins": 8.576562881469727, "rewards/rejected": -10.512499809265137, "step": 9450 }, { "epoch": 2.493410648392198, "grad_norm": 13.907161969386184, "learning_rate": 3.7671323141802845e-07, "logits/chosen": 0.0077758789993822575, "logits/rejected": -0.34733277559280396, "logps/chosen": -409.45001220703125, "logps/rejected": -478.0, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -1.8157227039337158, "rewards/margins": 8.665624618530273, "rewards/rejected": -10.4765625, "step": 9460 }, { "epoch": 2.496046389035319, "grad_norm": 5.527100712391087, "learning_rate": 3.7605429625724825e-07, "logits/chosen": -0.26698607206344604, "logits/rejected": -0.655224621295929, "logps/chosen": -383.45001220703125, "logps/rejected": -444.1499938964844, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.9560058116912842, "rewards/margins": 8.707812309265137, "rewards/rejected": -10.659375190734863, "step": 9470 }, { "epoch": 2.4986821296784396, "grad_norm": 9.310029177611304, "learning_rate": 3.753953610964681e-07, "logits/chosen": -0.2120361328125, "logits/rejected": -0.559521496295929, "logps/chosen": -394.3999938964844, "logps/rejected": -453.3999938964844, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -2.1460938453674316, "rewards/margins": 8.8125, "rewards/rejected": -10.956250190734863, "step": 9480 }, { "epoch": 2.5013178703215604, "grad_norm": 1.0360562101944222, "learning_rate": 3.7473642593568795e-07, "logits/chosen": -0.3230529725551605, "logits/rejected": -0.51318359375, "logps/chosen": -401.8500061035156, "logps/rejected": -485.54998779296875, "loss": 0.0374, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.09423828125, "rewards/margins": 8.389062881469727, "rewards/rejected": -10.485937118530273, "step": 9490 }, { "epoch": 2.503953610964681, "grad_norm": 144.53362510097602, "learning_rate": 3.7407749077490775e-07, "logits/chosen": -0.26606446504592896, "logits/rejected": -0.558642566204071, "logps/chosen": -349.45001220703125, "logps/rejected": -429.5, "loss": 0.0182, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8337891101837158, "rewards/margins": 8.537500381469727, "rewards/rejected": -10.368749618530273, "step": 9500 }, { "epoch": 2.506589351607802, "grad_norm": 6.557256354516561, "learning_rate": 3.7341855561412754e-07, "logits/chosen": -0.4317260682582855, "logits/rejected": -0.760302722454071, "logps/chosen": -374.29998779296875, "logps/rejected": -405.70001220703125, "loss": 0.0113, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.166186571121216, "rewards/margins": 8.21875, "rewards/rejected": -10.396875381469727, "step": 9510 }, { "epoch": 2.5092250922509223, "grad_norm": 5.703816498820012, "learning_rate": 3.7275962045334734e-07, "logits/chosen": -0.34980469942092896, "logits/rejected": -0.43421632051467896, "logps/chosen": -387.1499938964844, "logps/rejected": -432.8999938964844, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.53448486328125, "rewards/margins": 8.459375381469727, "rewards/rejected": -9.996874809265137, "step": 9520 }, { "epoch": 2.511860832894043, "grad_norm": 132.50817379999248, "learning_rate": 3.721006852925672e-07, "logits/chosen": -0.21009521186351776, "logits/rejected": -0.3915420472621918, "logps/chosen": -404.8999938964844, "logps/rejected": -441.0, "loss": 0.0261, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7083008289337158, "rewards/margins": 8.271875381469727, "rewards/rejected": -9.982812881469727, "step": 9530 }, { "epoch": 2.514496573537164, "grad_norm": 74.25631580881496, "learning_rate": 3.7144175013178704e-07, "logits/chosen": -0.35490721464157104, "logits/rejected": -0.49876099824905396, "logps/chosen": -372.5249938964844, "logps/rejected": -449.6000061035156, "loss": 0.0243, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.690185546875, "rewards/margins": 8.4609375, "rewards/rejected": -10.140625, "step": 9540 }, { "epoch": 2.5171323141802846, "grad_norm": 0.5311310469040611, "learning_rate": 3.7078281497100684e-07, "logits/chosen": -0.38761597871780396, "logits/rejected": -0.64056396484375, "logps/chosen": -409.0, "logps/rejected": -424.8500061035156, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -1.76953125, "rewards/margins": 7.896874904632568, "rewards/rejected": -9.670312881469727, "step": 9550 }, { "epoch": 2.5197680548234054, "grad_norm": 0.81143385639811, "learning_rate": 3.7012387981022664e-07, "logits/chosen": -0.36867982149124146, "logits/rejected": -0.64892578125, "logps/chosen": -389.95001220703125, "logps/rejected": -426.29998779296875, "loss": 0.0302, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4553711414337158, "rewards/margins": 7.974999904632568, "rewards/rejected": -9.431249618530273, "step": 9560 }, { "epoch": 2.522403795466526, "grad_norm": 191.43779216535182, "learning_rate": 3.694649446494465e-07, "logits/chosen": -0.4426818788051605, "logits/rejected": -0.5318847894668579, "logps/chosen": -384.0, "logps/rejected": -464.70001220703125, "loss": 0.0336, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9480469226837158, "rewards/margins": 7.839062690734863, "rewards/rejected": -9.787500381469727, "step": 9570 }, { "epoch": 2.525039536109647, "grad_norm": 1.7260768878093145, "learning_rate": 3.688060094886663e-07, "logits/chosen": -0.30531007051467896, "logits/rejected": -0.6961914300918579, "logps/chosen": -379.3500061035156, "logps/rejected": -465.3999938964844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.123156785964966, "rewards/margins": 8.770312309265137, "rewards/rejected": -10.890625, "step": 9580 }, { "epoch": 2.5276752767527677, "grad_norm": 0.8124637622406192, "learning_rate": 3.6814707432788613e-07, "logits/chosen": -0.4315338134765625, "logits/rejected": -0.727612316608429, "logps/chosen": -379.6000061035156, "logps/rejected": -435.70001220703125, "loss": 0.0225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.150439500808716, "rewards/margins": 7.884375095367432, "rewards/rejected": -10.040624618530273, "step": 9590 }, { "epoch": 2.5303110173958885, "grad_norm": 2.8268453992738634, "learning_rate": 3.6748813916710593e-07, "logits/chosen": -0.3180603086948395, "logits/rejected": -0.5353759527206421, "logps/chosen": -400.20001220703125, "logps/rejected": -430.3999938964844, "loss": 0.0126, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9145996570587158, "rewards/margins": 8.221875190734863, "rewards/rejected": -10.128125190734863, "step": 9600 }, { "epoch": 2.532946758039009, "grad_norm": 99.67282409996861, "learning_rate": 3.668292040063258e-07, "logits/chosen": -0.5127830505371094, "logits/rejected": -0.5362488031387329, "logps/chosen": -357.75, "logps/rejected": -450.6499938964844, "loss": 0.0432, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7981445789337158, "rewards/margins": 7.800000190734863, "rewards/rejected": -9.598437309265137, "step": 9610 }, { "epoch": 2.5355824986821296, "grad_norm": 2.030321921700713, "learning_rate": 3.661702688455456e-07, "logits/chosen": -0.13692626357078552, "logits/rejected": -0.5108886957168579, "logps/chosen": -442.3500061035156, "logps/rejected": -445.70001220703125, "loss": 0.0182, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4568359851837158, "rewards/margins": 7.620312690734863, "rewards/rejected": -9.084375381469727, "step": 9620 }, { "epoch": 2.5382182393252504, "grad_norm": 4.3966689550638325, "learning_rate": 3.655113336847654e-07, "logits/chosen": -0.37150269746780396, "logits/rejected": -0.582043468952179, "logps/chosen": -363.5, "logps/rejected": -431.70001220703125, "loss": 0.0124, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9874999523162842, "rewards/margins": 8.446874618530273, "rewards/rejected": -10.434374809265137, "step": 9630 }, { "epoch": 2.540853979968371, "grad_norm": 0.6065081275545849, "learning_rate": 3.6485239852398523e-07, "logits/chosen": -0.24857178330421448, "logits/rejected": -0.4835205078125, "logps/chosen": -382.25, "logps/rejected": -425.54998779296875, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8083007335662842, "rewards/margins": 8.353124618530273, "rewards/rejected": -10.165624618530273, "step": 9640 }, { "epoch": 2.543489720611492, "grad_norm": 4.449481494331649, "learning_rate": 3.641934633632051e-07, "logits/chosen": -0.17802734673023224, "logits/rejected": -0.7081543207168579, "logps/chosen": -352.54998779296875, "logps/rejected": -421.1000061035156, "loss": 0.0195, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9613434076309204, "rewards/margins": 8.739062309265137, "rewards/rejected": -10.706250190734863, "step": 9650 }, { "epoch": 2.5461254612546127, "grad_norm": 2.3249906426785873, "learning_rate": 3.635345282024249e-07, "logits/chosen": -0.14650878310203552, "logits/rejected": -0.595507800579071, "logps/chosen": -346.75, "logps/rejected": -399.45001220703125, "loss": 0.0211, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.91064453125, "rewards/margins": 8.885937690734863, "rewards/rejected": -10.793749809265137, "step": 9660 }, { "epoch": 2.548761201897733, "grad_norm": 9.232954500022306, "learning_rate": 3.6287559304164467e-07, "logits/chosen": -0.3374694883823395, "logits/rejected": -0.666259765625, "logps/chosen": -339.1499938964844, "logps/rejected": -404.8999938964844, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.973535180091858, "rewards/margins": 8.15625, "rewards/rejected": -10.134374618530273, "step": 9670 }, { "epoch": 2.551396942540854, "grad_norm": 1.7479720911676138, "learning_rate": 3.622166578808645e-07, "logits/chosen": -0.42646485567092896, "logits/rejected": -0.702929675579071, "logps/chosen": -402.54998779296875, "logps/rejected": -431.6000061035156, "loss": 0.0257, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5463378429412842, "rewards/margins": 8.15625, "rewards/rejected": -9.704687118530273, "step": 9680 }, { "epoch": 2.5540326831839746, "grad_norm": 42.13915880751037, "learning_rate": 3.615577227200843e-07, "logits/chosen": -0.39912718534469604, "logits/rejected": -0.63232421875, "logps/chosen": -418.5, "logps/rejected": -436.29998779296875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -1.9107544422149658, "rewards/margins": 8.225000381469727, "rewards/rejected": -10.134374618530273, "step": 9690 }, { "epoch": 2.5566684238270954, "grad_norm": 2.6247947446874376, "learning_rate": 3.608987875593041e-07, "logits/chosen": -0.38874512910842896, "logits/rejected": -0.5304809808731079, "logps/chosen": -383.70001220703125, "logps/rejected": -467.8999938964844, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5265624523162842, "rewards/margins": 8.489062309265137, "rewards/rejected": -10.012499809265137, "step": 9700 }, { "epoch": 2.559304164470216, "grad_norm": 16.08533622900547, "learning_rate": 3.6023985239852397e-07, "logits/chosen": -0.30759888887405396, "logits/rejected": -0.5373169183731079, "logps/chosen": -393.6499938964844, "logps/rejected": -449.70001220703125, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.791406273841858, "rewards/margins": 8.331250190734863, "rewards/rejected": -10.128125190734863, "step": 9710 }, { "epoch": 2.561939905113337, "grad_norm": 27.335818483125855, "learning_rate": 3.595809172377438e-07, "logits/chosen": -0.23891600966453552, "logits/rejected": -0.4970703125, "logps/chosen": -372.0, "logps/rejected": -408.54998779296875, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.416894555091858, "rewards/margins": 7.745312690734863, "rewards/rejected": -9.162500381469727, "step": 9720 }, { "epoch": 2.5645756457564577, "grad_norm": 2.7540612998979075, "learning_rate": 3.589219820769636e-07, "logits/chosen": -0.3575195372104645, "logits/rejected": -0.564648449420929, "logps/chosen": -370.5, "logps/rejected": -457.1000061035156, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2051513195037842, "rewards/margins": 8.479687690734863, "rewards/rejected": -9.6875, "step": 9730 }, { "epoch": 2.5672113863995785, "grad_norm": 0.5523738839275947, "learning_rate": 3.582630469161834e-07, "logits/chosen": -0.35604554414749146, "logits/rejected": -0.5630859136581421, "logps/chosen": -382.0, "logps/rejected": -458.0, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.513494849205017, "rewards/margins": 8.831250190734863, "rewards/rejected": -10.3515625, "step": 9740 }, { "epoch": 2.5698471270426992, "grad_norm": 39.05260503759925, "learning_rate": 3.576041117554032e-07, "logits/chosen": -0.2564453184604645, "logits/rejected": -0.495147705078125, "logps/chosen": -374.29998779296875, "logps/rejected": -449.6000061035156, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.4263184070587158, "rewards/margins": 8.0703125, "rewards/rejected": -9.489062309265137, "step": 9750 }, { "epoch": 2.5724828676858196, "grad_norm": 11.37710615254536, "learning_rate": 3.569451765946231e-07, "logits/chosen": -0.15837402641773224, "logits/rejected": -0.576171875, "logps/chosen": -415.29998779296875, "logps/rejected": -454.5, "loss": 0.0351, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.933203101158142, "rewards/margins": 8.543749809265137, "rewards/rejected": -10.478124618530273, "step": 9760 }, { "epoch": 2.5751186083289404, "grad_norm": 3.066417614534665, "learning_rate": 3.562862414338429e-07, "logits/chosen": -0.010638427920639515, "logits/rejected": -0.30644530057907104, "logps/chosen": -381.20001220703125, "logps/rejected": -474.3999938964844, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.4280273914337158, "rewards/margins": 8.096875190734863, "rewards/rejected": -9.524999618530273, "step": 9770 }, { "epoch": 2.577754348972061, "grad_norm": 5.32059697739374, "learning_rate": 3.556273062730627e-07, "logits/chosen": -0.33916014432907104, "logits/rejected": -0.5479980707168579, "logps/chosen": -366.6499938964844, "logps/rejected": -439.8999938964844, "loss": 0.0171, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.499609351158142, "rewards/margins": 8.657812118530273, "rewards/rejected": -10.157812118530273, "step": 9780 }, { "epoch": 2.580390089615182, "grad_norm": 8.626490510602824, "learning_rate": 3.5496837111228256e-07, "logits/chosen": -0.2919921875, "logits/rejected": -0.614062488079071, "logps/chosen": -371.8999938964844, "logps/rejected": -468.29998779296875, "loss": 0.0381, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.484570264816284, "rewards/margins": 8.6328125, "rewards/rejected": -11.112500190734863, "step": 9790 }, { "epoch": 2.5830258302583027, "grad_norm": 3.2548748579055924, "learning_rate": 3.5430943595150235e-07, "logits/chosen": -0.41755372285842896, "logits/rejected": -0.6375976800918579, "logps/chosen": -431.0, "logps/rejected": -464.29998779296875, "loss": 0.0174, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.300585985183716, "rewards/margins": 8.332812309265137, "rewards/rejected": -10.637499809265137, "step": 9800 }, { "epoch": 2.5856615709014235, "grad_norm": 10.088882689647454, "learning_rate": 3.5365050079072215e-07, "logits/chosen": -0.2529540956020355, "logits/rejected": -0.43559569120407104, "logps/chosen": -349.1499938964844, "logps/rejected": -443.70001220703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596190929412842, "rewards/margins": 8.2578125, "rewards/rejected": -9.910937309265137, "step": 9810 }, { "epoch": 2.588297311544544, "grad_norm": 4.529638792122532, "learning_rate": 3.52991565629942e-07, "logits/chosen": -0.2856811583042145, "logits/rejected": -0.625781238079071, "logps/chosen": -350.79998779296875, "logps/rejected": -419.3999938964844, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.251953125, "rewards/margins": 7.712500095367432, "rewards/rejected": -9.9609375, "step": 9820 }, { "epoch": 2.5909330521876646, "grad_norm": 3.950859611833643, "learning_rate": 3.5233263046916185e-07, "logits/chosen": -0.3160034120082855, "logits/rejected": -0.764514148235321, "logps/chosen": -424.8999938964844, "logps/rejected": -419.0, "loss": 0.0177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.824462890625, "rewards/margins": 8.662500381469727, "rewards/rejected": -10.487500190734863, "step": 9830 }, { "epoch": 2.5935687928307853, "grad_norm": 7.017620120427696, "learning_rate": 3.5167369530838165e-07, "logits/chosen": -0.29155272245407104, "logits/rejected": -0.49566650390625, "logps/chosen": -398.0, "logps/rejected": -450.70001220703125, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.2015624046325684, "rewards/margins": 8.321874618530273, "rewards/rejected": -10.521875381469727, "step": 9840 }, { "epoch": 2.596204533473906, "grad_norm": 0.5007257179165561, "learning_rate": 3.5101476014760145e-07, "logits/chosen": -0.40385740995407104, "logits/rejected": -0.56231689453125, "logps/chosen": -391.95001220703125, "logps/rejected": -453.79998779296875, "loss": 0.0274, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9994628429412842, "rewards/margins": 8.456250190734863, "rewards/rejected": -10.449999809265137, "step": 9850 }, { "epoch": 2.598840274117027, "grad_norm": 0.5966058794886929, "learning_rate": 3.5035582498682124e-07, "logits/chosen": -0.36939698457717896, "logits/rejected": -0.663134753704071, "logps/chosen": -366.0, "logps/rejected": -412.3999938964844, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.076367139816284, "rewards/margins": 8.809374809265137, "rewards/rejected": -10.884374618530273, "step": 9860 }, { "epoch": 2.6014760147601477, "grad_norm": 1.1469487996106933, "learning_rate": 3.496968898260411e-07, "logits/chosen": -0.42548829317092896, "logits/rejected": -0.4908203184604645, "logps/chosen": -406.6499938964844, "logps/rejected": -461.70001220703125, "loss": 0.0256, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.689697265625, "rewards/margins": 8.5859375, "rewards/rejected": -10.28125, "step": 9870 }, { "epoch": 2.6041117554032684, "grad_norm": 12.779554324350478, "learning_rate": 3.4903795466526094e-07, "logits/chosen": -0.3479858338832855, "logits/rejected": -0.4957519471645355, "logps/chosen": -419.0, "logps/rejected": -470.79998779296875, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.787500023841858, "rewards/margins": 8.59375, "rewards/rejected": -10.381250381469727, "step": 9880 }, { "epoch": 2.6067474960463892, "grad_norm": 5.634302595297186, "learning_rate": 3.4837901950448074e-07, "logits/chosen": -0.36741942167282104, "logits/rejected": -0.635302722454071, "logps/chosen": -361.29998779296875, "logps/rejected": -405.95001220703125, "loss": 0.0154, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.440820336341858, "rewards/margins": 8.040624618530273, "rewards/rejected": -9.475000381469727, "step": 9890 }, { "epoch": 2.60938323668951, "grad_norm": 5.111516097649816, "learning_rate": 3.4772008434370054e-07, "logits/chosen": -0.34516602754592896, "logits/rejected": -0.479248046875, "logps/chosen": -384.95001220703125, "logps/rejected": -464.8999938964844, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9480469226837158, "rewards/margins": 7.993750095367432, "rewards/rejected": -9.939062118530273, "step": 9900 }, { "epoch": 2.6120189773326303, "grad_norm": 0.7301447330987877, "learning_rate": 3.470611491829204e-07, "logits/chosen": -0.20399780571460724, "logits/rejected": -0.4256530702114105, "logps/chosen": -393.25, "logps/rejected": -460.70001220703125, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.2626953125, "rewards/margins": 8.379687309265137, "rewards/rejected": -9.631250381469727, "step": 9910 }, { "epoch": 2.614654717975751, "grad_norm": 1.6722370793037529, "learning_rate": 3.464022140221402e-07, "logits/chosen": -0.525561511516571, "logits/rejected": -0.635546863079071, "logps/chosen": -369.8500061035156, "logps/rejected": -425.25, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -2.226757764816284, "rewards/margins": 8.056249618530273, "rewards/rejected": -10.285937309265137, "step": 9920 }, { "epoch": 2.617290458618872, "grad_norm": 5.91936203590413, "learning_rate": 3.4574327886136004e-07, "logits/chosen": -0.3058105409145355, "logits/rejected": -0.4312744140625, "logps/chosen": -354.29998779296875, "logps/rejected": -434.5, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.806054711341858, "rewards/margins": 8.140625, "rewards/rejected": -9.943750381469727, "step": 9930 }, { "epoch": 2.6199261992619927, "grad_norm": 77.06891588000343, "learning_rate": 3.450843437005799e-07, "logits/chosen": -0.31964111328125, "logits/rejected": -0.71142578125, "logps/chosen": -402.25, "logps/rejected": -458.5, "loss": 0.0242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.830419898033142, "rewards/margins": 8.765625, "rewards/rejected": -10.609375, "step": 9940 }, { "epoch": 2.6225619399051134, "grad_norm": 3.520915546866842, "learning_rate": 3.444254085397997e-07, "logits/chosen": -0.4822753965854645, "logits/rejected": -0.552734375, "logps/chosen": -346.0, "logps/rejected": -448.5, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -2.132617235183716, "rewards/margins": 8.887499809265137, "rewards/rejected": -11.021875381469727, "step": 9950 }, { "epoch": 2.625197680548234, "grad_norm": 7.731550591218527, "learning_rate": 3.437664733790195e-07, "logits/chosen": -0.32355958223342896, "logits/rejected": -0.6316772699356079, "logps/chosen": -356.54998779296875, "logps/rejected": -396.25, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -1.4984862804412842, "rewards/margins": 8.4296875, "rewards/rejected": -9.931249618530273, "step": 9960 }, { "epoch": 2.6278334211913545, "grad_norm": 2.2719504374251587, "learning_rate": 3.431075382182393e-07, "logits/chosen": -0.349273681640625, "logits/rejected": -0.537976086139679, "logps/chosen": -346.6499938964844, "logps/rejected": -446.45001220703125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -2.08447265625, "rewards/margins": 8.734375, "rewards/rejected": -10.826562881469727, "step": 9970 }, { "epoch": 2.6304691618344753, "grad_norm": 10.570543614769239, "learning_rate": 3.4244860305745913e-07, "logits/chosen": -0.44868165254592896, "logits/rejected": -0.6635497808456421, "logps/chosen": -409.79998779296875, "logps/rejected": -439.8999938964844, "loss": 0.0257, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6752440929412842, "rewards/margins": 8.457812309265137, "rewards/rejected": -10.1328125, "step": 9980 }, { "epoch": 2.633104902477596, "grad_norm": 3.6710775390754957, "learning_rate": 3.41789667896679e-07, "logits/chosen": -0.32902830839157104, "logits/rejected": -0.573559582233429, "logps/chosen": -393.8999938964844, "logps/rejected": -431.1000061035156, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.7109375, "rewards/margins": 8.496874809265137, "rewards/rejected": -10.221875190734863, "step": 9990 }, { "epoch": 2.635740643120717, "grad_norm": 13.935544244323884, "learning_rate": 3.411307327358988e-07, "logits/chosen": -0.556689441204071, "logits/rejected": -0.6964355707168579, "logps/chosen": -389.54998779296875, "logps/rejected": -480.70001220703125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.806542992591858, "rewards/margins": 8.671875, "rewards/rejected": -10.475000381469727, "step": 10000 }, { "epoch": 2.6383763837638377, "grad_norm": 5.725979379563632, "learning_rate": 3.404717975751186e-07, "logits/chosen": -0.216217041015625, "logits/rejected": -0.6399902105331421, "logps/chosen": -426.54998779296875, "logps/rejected": -487.29998779296875, "loss": 0.0141, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.915673851966858, "rewards/margins": 8.542187690734863, "rewards/rejected": -10.4609375, "step": 10010 }, { "epoch": 2.6410121244069584, "grad_norm": 2.3511079358771196, "learning_rate": 3.398128624143384e-07, "logits/chosen": -0.592822253704071, "logits/rejected": -0.764843761920929, "logps/chosen": -381.75, "logps/rejected": -426.6000061035156, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.758691430091858, "rewards/margins": 8.662500381469727, "rewards/rejected": -10.415624618530273, "step": 10020 }, { "epoch": 2.643647865050079, "grad_norm": 3.676353981996979, "learning_rate": 3.391539272535582e-07, "logits/chosen": -0.3766845762729645, "logits/rejected": -0.5720764398574829, "logps/chosen": -409.6499938964844, "logps/rejected": -483.8999938964844, "loss": 0.0172, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9998047351837158, "rewards/margins": 8.074999809265137, "rewards/rejected": -10.071874618530273, "step": 10030 }, { "epoch": 2.6462836056932, "grad_norm": 0.5467474398092967, "learning_rate": 3.38494992092778e-07, "logits/chosen": -0.3292236328125, "logits/rejected": -0.550707995891571, "logps/chosen": -360.8500061035156, "logps/rejected": -471.29998779296875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -1.46356201171875, "rewards/margins": 8.496874809265137, "rewards/rejected": -9.964062690734863, "step": 10040 }, { "epoch": 2.6489193463363208, "grad_norm": 3.0952634171033266, "learning_rate": 3.378360569319979e-07, "logits/chosen": -0.3634033203125, "logits/rejected": -0.601275622844696, "logps/chosen": -372.95001220703125, "logps/rejected": -441.3999938964844, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -1.4796874523162842, "rewards/margins": 8.579687118530273, "rewards/rejected": -10.059374809265137, "step": 10050 }, { "epoch": 2.651555086979441, "grad_norm": 0.8618294500991285, "learning_rate": 3.371771217712177e-07, "logits/chosen": -0.41765135526657104, "logits/rejected": -0.692626953125, "logps/chosen": -356.3500061035156, "logps/rejected": -454.1000061035156, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.900976538658142, "rewards/margins": 8.9765625, "rewards/rejected": -10.875, "step": 10060 }, { "epoch": 2.654190827622562, "grad_norm": 2.291695617643009, "learning_rate": 3.365181866104375e-07, "logits/chosen": -0.3662109375, "logits/rejected": -0.580126941204071, "logps/chosen": -388.32501220703125, "logps/rejected": -435.3999938964844, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -1.8546874523162842, "rewards/margins": 8.5390625, "rewards/rejected": -10.395312309265137, "step": 10070 }, { "epoch": 2.6568265682656826, "grad_norm": 1.479410394928282, "learning_rate": 3.358592514496573e-07, "logits/chosen": -0.4124511778354645, "logits/rejected": -0.691845715045929, "logps/chosen": -367.1000061035156, "logps/rejected": -435.20001220703125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -2.325927734375, "rewards/margins": 8.579687118530273, "rewards/rejected": -10.901562690734863, "step": 10080 }, { "epoch": 2.6594623089088034, "grad_norm": 1.8090300767957856, "learning_rate": 3.3520031628887716e-07, "logits/chosen": -0.45097047090530396, "logits/rejected": -0.697070300579071, "logps/chosen": -427.95001220703125, "logps/rejected": -473.79998779296875, "loss": 0.0165, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.806249976158142, "rewards/margins": 8.509374618530273, "rewards/rejected": -10.307812690734863, "step": 10090 }, { "epoch": 2.662098049551924, "grad_norm": 88.04479424724992, "learning_rate": 3.34541381128097e-07, "logits/chosen": -0.452890008687973, "logits/rejected": -0.7215820550918579, "logps/chosen": -393.6000061035156, "logps/rejected": -441.8999938964844, "loss": 0.0155, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.786450147628784, "rewards/margins": 8.623437881469727, "rewards/rejected": -11.409375190734863, "step": 10100 }, { "epoch": 2.664733790195045, "grad_norm": 16.594169112321353, "learning_rate": 3.338824459673168e-07, "logits/chosen": -0.4603820741176605, "logits/rejected": -0.7343994379043579, "logps/chosen": -394.95001220703125, "logps/rejected": -422.6000061035156, "loss": 0.0375, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0595703125, "rewards/margins": 7.953125, "rewards/rejected": -11.021875381469727, "step": 10110 }, { "epoch": 2.6673695308381653, "grad_norm": 305.2474470535191, "learning_rate": 3.332235108065366e-07, "logits/chosen": -0.3328613340854645, "logits/rejected": -0.745166003704071, "logps/chosen": -445.54998779296875, "logps/rejected": -496.1000061035156, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4527344703674316, "rewards/margins": 9.2265625, "rewards/rejected": -11.678125381469727, "step": 10120 }, { "epoch": 2.670005271481286, "grad_norm": 10.318378637774572, "learning_rate": 3.3256457564575646e-07, "logits/chosen": -0.441650390625, "logits/rejected": -0.604541003704071, "logps/chosen": -369.25, "logps/rejected": -443.75, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.2430663108825684, "rewards/margins": 8.873437881469727, "rewards/rejected": -11.115625381469727, "step": 10130 }, { "epoch": 2.672641012124407, "grad_norm": 16.354231527833697, "learning_rate": 3.3190564048497626e-07, "logits/chosen": -0.46123045682907104, "logits/rejected": -0.6141113042831421, "logps/chosen": -365.54998779296875, "logps/rejected": -443.1000061035156, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -2.545703172683716, "rewards/margins": 8.7578125, "rewards/rejected": -11.300000190734863, "step": 10140 }, { "epoch": 2.6752767527675276, "grad_norm": 2.3673512436786397, "learning_rate": 3.3124670532419605e-07, "logits/chosen": -0.4267211854457855, "logits/rejected": -0.6205078363418579, "logps/chosen": -397.8999938964844, "logps/rejected": -424.29998779296875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -2.2671265602111816, "rewards/margins": 7.778124809265137, "rewards/rejected": -10.043749809265137, "step": 10150 }, { "epoch": 2.6779124934106484, "grad_norm": 14.986568048195354, "learning_rate": 3.305877701634159e-07, "logits/chosen": -0.588549792766571, "logits/rejected": -0.7347656488418579, "logps/chosen": -389.6499938964844, "logps/rejected": -443.5, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.212695360183716, "rewards/margins": 8.918749809265137, "rewards/rejected": -11.128125190734863, "step": 10160 }, { "epoch": 2.680548234053769, "grad_norm": 6.789506819474046, "learning_rate": 3.2992883500263575e-07, "logits/chosen": -0.3780761659145355, "logits/rejected": -0.571606457233429, "logps/chosen": -404.45001220703125, "logps/rejected": -466.5, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.416015625, "rewards/margins": 8.731249809265137, "rewards/rejected": -11.143750190734863, "step": 10170 }, { "epoch": 2.68318397469689, "grad_norm": 4.13054723834128, "learning_rate": 3.2926989984185555e-07, "logits/chosen": -0.34553223848342896, "logits/rejected": -0.6512695550918579, "logps/chosen": -409.6000061035156, "logps/rejected": -443.29998779296875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.557031273841858, "rewards/margins": 8.671875, "rewards/rejected": -10.228124618530273, "step": 10180 }, { "epoch": 2.6858197153400107, "grad_norm": 5.736221137370455, "learning_rate": 3.2861096468107535e-07, "logits/chosen": -0.2822509706020355, "logits/rejected": -0.60394287109375, "logps/chosen": -456.1000061035156, "logps/rejected": -490.8999938964844, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.8987305164337158, "rewards/margins": 9.115625381469727, "rewards/rejected": -11.018750190734863, "step": 10190 }, { "epoch": 2.6884554559831315, "grad_norm": 20.89205593217189, "learning_rate": 3.279520295202952e-07, "logits/chosen": -0.5414794683456421, "logits/rejected": -0.748974621295929, "logps/chosen": -375.3500061035156, "logps/rejected": -404.95001220703125, "loss": 0.029, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6156249046325684, "rewards/margins": 8.259374618530273, "rewards/rejected": -10.868749618530273, "step": 10200 }, { "epoch": 2.691091196626252, "grad_norm": 1.050315531467532, "learning_rate": 3.27293094359515e-07, "logits/chosen": -0.4379638731479645, "logits/rejected": -0.62579345703125, "logps/chosen": -370.20001220703125, "logps/rejected": -419.20001220703125, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.450390577316284, "rewards/margins": 8.396875381469727, "rewards/rejected": -10.84375, "step": 10210 }, { "epoch": 2.6937269372693726, "grad_norm": 12.130732397230023, "learning_rate": 3.2663415919873485e-07, "logits/chosen": -0.3624206483364105, "logits/rejected": -0.653515636920929, "logps/chosen": -402.79998779296875, "logps/rejected": -493.1000061035156, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0038084983825684, "rewards/margins": 8.740625381469727, "rewards/rejected": -10.751562118530273, "step": 10220 }, { "epoch": 2.6963626779124934, "grad_norm": 3.2998341716637984, "learning_rate": 3.2597522403795464e-07, "logits/chosen": -0.3564254641532898, "logits/rejected": -0.71435546875, "logps/chosen": -414.3500061035156, "logps/rejected": -460.20001220703125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.321484327316284, "rewards/margins": 9.040624618530273, "rewards/rejected": -11.362500190734863, "step": 10230 }, { "epoch": 2.698998418555614, "grad_norm": 0.7316749272118511, "learning_rate": 3.253162888771745e-07, "logits/chosen": -0.2949462831020355, "logits/rejected": -0.645825207233429, "logps/chosen": -401.6000061035156, "logps/rejected": -460.8999938964844, "loss": 0.0113, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7249023914337158, "rewards/margins": 8.565625190734863, "rewards/rejected": -10.296875, "step": 10240 }, { "epoch": 2.701634159198735, "grad_norm": 1.3210931013614315, "learning_rate": 3.246573537163943e-07, "logits/chosen": -0.3380126953125, "logits/rejected": -0.6023925542831421, "logps/chosen": -375.29998779296875, "logps/rejected": -452.5, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.890039086341858, "rewards/margins": 9.024999618530273, "rewards/rejected": -10.925000190734863, "step": 10250 }, { "epoch": 2.7042698998418553, "grad_norm": 23.68876228038278, "learning_rate": 3.239984185556141e-07, "logits/chosen": -0.15958862006664276, "logits/rejected": -0.624316394329071, "logps/chosen": -430.3999938964844, "logps/rejected": -459.6000061035156, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.169921875, "rewards/margins": 8.423437118530273, "rewards/rejected": -10.592187881469727, "step": 10260 }, { "epoch": 2.706905640484976, "grad_norm": 158.86010998362073, "learning_rate": 3.233394833948339e-07, "logits/chosen": -0.48112601041793823, "logits/rejected": -0.581616222858429, "logps/chosen": -378.29998779296875, "logps/rejected": -448.1000061035156, "loss": 0.0149, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.360546827316284, "rewards/margins": 8.625, "rewards/rejected": -10.993749618530273, "step": 10270 }, { "epoch": 2.709541381128097, "grad_norm": 57.90832429840588, "learning_rate": 3.226805482340538e-07, "logits/chosen": -0.44089967012405396, "logits/rejected": -0.609240710735321, "logps/chosen": -404.1000061035156, "logps/rejected": -456.6000061035156, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -2.096874952316284, "rewards/margins": 8.978124618530273, "rewards/rejected": -11.078125, "step": 10280 }, { "epoch": 2.7121771217712176, "grad_norm": 11.600531442983733, "learning_rate": 3.220216130732736e-07, "logits/chosen": -0.40064698457717896, "logits/rejected": -0.7342468500137329, "logps/chosen": -392.95001220703125, "logps/rejected": -472.1000061035156, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -2.2138671875, "rewards/margins": 8.657812118530273, "rewards/rejected": -10.864062309265137, "step": 10290 }, { "epoch": 2.7148128624143384, "grad_norm": 0.34415959027617027, "learning_rate": 3.213626779124934e-07, "logits/chosen": -0.26862794160842896, "logits/rejected": -0.658447265625, "logps/chosen": -413.1000061035156, "logps/rejected": -448.79998779296875, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7170898914337158, "rewards/margins": 8.859375, "rewards/rejected": -10.578125, "step": 10300 }, { "epoch": 2.717448603057459, "grad_norm": 7.192452058146847, "learning_rate": 3.207037427517132e-07, "logits/chosen": -0.19333496689796448, "logits/rejected": -0.670166015625, "logps/chosen": -422.6000061035156, "logps/rejected": -425.8999938964844, "loss": 0.0508, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7502930164337158, "rewards/margins": 8.637499809265137, "rewards/rejected": -10.389062881469727, "step": 10310 }, { "epoch": 2.72008434370058, "grad_norm": 4.913477663719823, "learning_rate": 3.2004480759093303e-07, "logits/chosen": -0.2862304747104645, "logits/rejected": -0.576123058795929, "logps/chosen": -463.29998779296875, "logps/rejected": -515.7000122070312, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0531249046325684, "rewards/margins": 8.626562118530273, "rewards/rejected": -10.678125381469727, "step": 10320 }, { "epoch": 2.7227200843437007, "grad_norm": 7.144629114310936, "learning_rate": 3.193858724301529e-07, "logits/chosen": -0.3797607421875, "logits/rejected": -0.6722167730331421, "logps/chosen": -425.8999938964844, "logps/rejected": -455.6499938964844, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -1.67626953125, "rewards/margins": 8.598437309265137, "rewards/rejected": -10.278124809265137, "step": 10330 }, { "epoch": 2.7253558249868215, "grad_norm": 53.063560555171364, "learning_rate": 3.187269372693727e-07, "logits/chosen": -0.33390504121780396, "logits/rejected": -0.5847533941268921, "logps/chosen": -376.25, "logps/rejected": -415.5, "loss": 0.0246, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.055981397628784, "rewards/margins": 8.5, "rewards/rejected": -10.559374809265137, "step": 10340 }, { "epoch": 2.7279915656299423, "grad_norm": 20.885290001919984, "learning_rate": 3.1806800210859253e-07, "logits/chosen": -0.3025878965854645, "logits/rejected": -0.7181640863418579, "logps/chosen": -368.6499938964844, "logps/rejected": -400.1000061035156, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.433740258216858, "rewards/margins": 8.254687309265137, "rewards/rejected": -9.685937881469727, "step": 10350 }, { "epoch": 2.7306273062730626, "grad_norm": 0.7881089903856902, "learning_rate": 3.1740906694781233e-07, "logits/chosen": -0.3143554627895355, "logits/rejected": -0.7313476800918579, "logps/chosen": -400.3999938964844, "logps/rejected": -456.1000061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.3373045921325684, "rewards/margins": 8.473437309265137, "rewards/rejected": -10.8125, "step": 10360 }, { "epoch": 2.7332630469161834, "grad_norm": 2.2560369277468055, "learning_rate": 3.167501317870321e-07, "logits/chosen": -0.4440063536167145, "logits/rejected": -0.5253540277481079, "logps/chosen": -420.54998779296875, "logps/rejected": -444.79998779296875, "loss": 0.027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.861914038658142, "rewards/margins": 8.290624618530273, "rewards/rejected": -10.160937309265137, "step": 10370 }, { "epoch": 2.735898787559304, "grad_norm": 21.733998363150423, "learning_rate": 3.160911966262519e-07, "logits/chosen": -0.39689940214157104, "logits/rejected": -0.7446136474609375, "logps/chosen": -406.25, "logps/rejected": -434.79998779296875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -1.86328125, "rewards/margins": 8.528124809265137, "rewards/rejected": -10.384374618530273, "step": 10380 }, { "epoch": 2.738534528202425, "grad_norm": 4.716126270355852, "learning_rate": 3.154322614654718e-07, "logits/chosen": -0.457977294921875, "logits/rejected": -0.5570312738418579, "logps/chosen": -331.70001220703125, "logps/rejected": -428.1000061035156, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.9169189929962158, "rewards/margins": 8.232812881469727, "rewards/rejected": -10.157812118530273, "step": 10390 }, { "epoch": 2.7411702688455457, "grad_norm": 5.266153682823813, "learning_rate": 3.147733263046916e-07, "logits/chosen": -0.4608154296875, "logits/rejected": -0.5186523199081421, "logps/chosen": -428.25, "logps/rejected": -504.29998779296875, "loss": 0.0212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3900389671325684, "rewards/margins": 8.340624809265137, "rewards/rejected": -10.728124618530273, "step": 10400 }, { "epoch": 2.743806009488666, "grad_norm": 3.2602598884018352, "learning_rate": 3.141143911439114e-07, "logits/chosen": -0.46281129121780396, "logits/rejected": -0.54425048828125, "logps/chosen": -393.1499938964844, "logps/rejected": -458.20001220703125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.42138671875, "rewards/margins": 8.453125, "rewards/rejected": -9.875, "step": 10410 }, { "epoch": 2.746441750131787, "grad_norm": 3.80166485795218, "learning_rate": 3.134554559831312e-07, "logits/chosen": -0.3329101502895355, "logits/rejected": -0.5481628179550171, "logps/chosen": -404.25, "logps/rejected": -466.3999938964844, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.5652344226837158, "rewards/margins": 8.890625, "rewards/rejected": -10.453125, "step": 10420 }, { "epoch": 2.7490774907749076, "grad_norm": 21.753037943538004, "learning_rate": 3.1279652082235107e-07, "logits/chosen": -0.468017578125, "logits/rejected": -0.743945300579071, "logps/chosen": -405.20001220703125, "logps/rejected": -426.29998779296875, "loss": 0.0238, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4661133289337158, "rewards/margins": 8.776562690734863, "rewards/rejected": -10.234375, "step": 10430 }, { "epoch": 2.7517132314180284, "grad_norm": 10.018494055296598, "learning_rate": 3.1213758566157086e-07, "logits/chosen": -0.5040569305419922, "logits/rejected": -0.683398425579071, "logps/chosen": -356.25, "logps/rejected": -456.1000061035156, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -1.972509741783142, "rewards/margins": 9.2734375, "rewards/rejected": -11.237500190734863, "step": 10440 }, { "epoch": 2.754348972061149, "grad_norm": 8.748755042229215, "learning_rate": 3.114786505007907e-07, "logits/chosen": -0.3309570252895355, "logits/rejected": -0.585736095905304, "logps/chosen": -398.70001220703125, "logps/rejected": -481.1499938964844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.1089844703674316, "rewards/margins": 8.534375190734863, "rewards/rejected": -10.643750190734863, "step": 10450 }, { "epoch": 2.75698471270427, "grad_norm": 2.4013985525590655, "learning_rate": 3.108197153400105e-07, "logits/chosen": -0.40070801973342896, "logits/rejected": -0.6712280511856079, "logps/chosen": -380.95001220703125, "logps/rejected": -394.1000061035156, "loss": 0.0135, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.255078077316284, "rewards/margins": 8.032812118530273, "rewards/rejected": -10.293749809265137, "step": 10460 }, { "epoch": 2.7596204533473907, "grad_norm": 6.841267374051371, "learning_rate": 3.1016078017923036e-07, "logits/chosen": -0.44792479276657104, "logits/rejected": -0.635009765625, "logps/chosen": -342.70001220703125, "logps/rejected": -405.54998779296875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.181835889816284, "rewards/margins": 7.884375095367432, "rewards/rejected": -10.065625190734863, "step": 10470 }, { "epoch": 2.7622561939905115, "grad_norm": 183.5585074734584, "learning_rate": 3.0950184501845016e-07, "logits/chosen": -0.303314208984375, "logits/rejected": -0.6072677373886108, "logps/chosen": -358.29998779296875, "logps/rejected": -457.0, "loss": 0.023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5526854991912842, "rewards/margins": 8.564062118530273, "rewards/rejected": -10.112500190734863, "step": 10480 }, { "epoch": 2.7648919346336323, "grad_norm": 2.347902660720568, "learning_rate": 3.0884290985766996e-07, "logits/chosen": -0.32593995332717896, "logits/rejected": -0.5929321050643921, "logps/chosen": -373.79998779296875, "logps/rejected": -418.29998779296875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -2.2223143577575684, "rewards/margins": 8.4375, "rewards/rejected": -10.65625, "step": 10490 }, { "epoch": 2.767527675276753, "grad_norm": 7.705323288485831, "learning_rate": 3.0818397469688986e-07, "logits/chosen": -0.29145509004592896, "logits/rejected": -0.5645996332168579, "logps/chosen": -401.1000061035156, "logps/rejected": -454.70001220703125, "loss": 0.0156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.21826171875, "rewards/margins": 8.3984375, "rewards/rejected": -10.6171875, "step": 10500 }, { "epoch": 2.7701634159198734, "grad_norm": 9.513243997521638, "learning_rate": 3.0752503953610966e-07, "logits/chosen": -0.40478515625, "logits/rejected": -0.542431652545929, "logps/chosen": -383.45001220703125, "logps/rejected": -459.70001220703125, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -2.300976514816284, "rewards/margins": 8.317187309265137, "rewards/rejected": -10.615625381469727, "step": 10510 }, { "epoch": 2.772799156562994, "grad_norm": 14.953856220231033, "learning_rate": 3.0686610437532945e-07, "logits/chosen": -0.3559936583042145, "logits/rejected": -0.5167236328125, "logps/chosen": -401.95001220703125, "logps/rejected": -432.3999938964844, "loss": 0.0491, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.331835985183716, "rewards/margins": 7.918749809265137, "rewards/rejected": -10.2421875, "step": 10520 }, { "epoch": 2.775434897206115, "grad_norm": 55.651444715295426, "learning_rate": 3.0620716921454925e-07, "logits/chosen": -0.314828485250473, "logits/rejected": -0.649169921875, "logps/chosen": -406.1499938964844, "logps/rejected": -446.95001220703125, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.879003882408142, "rewards/margins": 8.618749618530273, "rewards/rejected": -10.495312690734863, "step": 10530 }, { "epoch": 2.7780706378492357, "grad_norm": 23.303622419531067, "learning_rate": 3.055482340537691e-07, "logits/chosen": -0.23292236030101776, "logits/rejected": -0.563507080078125, "logps/chosen": -407.8999938964844, "logps/rejected": -477.70001220703125, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -2.0746092796325684, "rewards/margins": 8.628125190734863, "rewards/rejected": -10.709375381469727, "step": 10540 }, { "epoch": 2.7807063784923565, "grad_norm": 0.5738670811379187, "learning_rate": 3.048892988929889e-07, "logits/chosen": -0.42052000761032104, "logits/rejected": -0.6217285394668579, "logps/chosen": -452.45001220703125, "logps/rejected": -489.8999938964844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.9988281726837158, "rewards/margins": 8.831250190734863, "rewards/rejected": -10.828125, "step": 10550 }, { "epoch": 2.783342119135477, "grad_norm": 2.4197177073844354, "learning_rate": 3.0423036373220875e-07, "logits/chosen": -0.24873046576976776, "logits/rejected": -0.48848265409469604, "logps/chosen": -430.29998779296875, "logps/rejected": -493.20001220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.8291015625, "rewards/margins": 8.910937309265137, "rewards/rejected": -10.743749618530273, "step": 10560 }, { "epoch": 2.7859778597785976, "grad_norm": 7.112681436871043, "learning_rate": 3.0357142857142855e-07, "logits/chosen": -0.40534669160842896, "logits/rejected": -0.6963866949081421, "logps/chosen": -415.25, "logps/rejected": -445.8999938964844, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -2.296008348464966, "rewards/margins": 8.725000381469727, "rewards/rejected": -11.024999618530273, "step": 10570 }, { "epoch": 2.7886136004217184, "grad_norm": 4.097798827725327, "learning_rate": 3.029124934106484e-07, "logits/chosen": -0.2543701231479645, "logits/rejected": -0.4609130918979645, "logps/chosen": -437.75, "logps/rejected": -457.79998779296875, "loss": 0.0414, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8731567859649658, "rewards/margins": 8.298437118530273, "rewards/rejected": -10.176562309265137, "step": 10580 }, { "epoch": 2.791249341064839, "grad_norm": 154.85924709497695, "learning_rate": 3.022535582498682e-07, "logits/chosen": -0.2822265625, "logits/rejected": -0.6807616949081421, "logps/chosen": -356.0, "logps/rejected": -437.1499938964844, "loss": 0.0289, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.394335985183716, "rewards/margins": 8.274999618530273, "rewards/rejected": -10.665624618530273, "step": 10590 }, { "epoch": 2.79388508170796, "grad_norm": 10.437938536034224, "learning_rate": 3.01594623089088e-07, "logits/chosen": -0.2512573301792145, "logits/rejected": -0.626269519329071, "logps/chosen": -410.6000061035156, "logps/rejected": -438.1000061035156, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.081310987472534, "rewards/margins": 8.526562690734863, "rewards/rejected": -10.615625381469727, "step": 10600 }, { "epoch": 2.7965208223510807, "grad_norm": 7.565449734530279, "learning_rate": 3.009356879283078e-07, "logits/chosen": -0.10745849460363388, "logits/rejected": -0.48649293184280396, "logps/chosen": -391.54998779296875, "logps/rejected": -432.6499938964844, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.092907667160034, "rewards/margins": 8.618749618530273, "rewards/rejected": -10.706250190734863, "step": 10610 }, { "epoch": 2.7991565629942015, "grad_norm": 1.300978237066513, "learning_rate": 3.002767527675277e-07, "logits/chosen": -0.4437255859375, "logits/rejected": -0.644299328327179, "logps/chosen": -395.54998779296875, "logps/rejected": -434.5, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0755858421325684, "rewards/margins": 8.846875190734863, "rewards/rejected": -10.925000190734863, "step": 10620 }, { "epoch": 2.8017923036373222, "grad_norm": 5.817595993442358, "learning_rate": 2.996178176067475e-07, "logits/chosen": -0.47382813692092896, "logits/rejected": -0.585540771484375, "logps/chosen": -429.70001220703125, "logps/rejected": -439.8999938964844, "loss": 0.0216, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3448243141174316, "rewards/margins": 8.415624618530273, "rewards/rejected": -10.762499809265137, "step": 10630 }, { "epoch": 2.804428044280443, "grad_norm": 11.924520464473543, "learning_rate": 2.989588824459673e-07, "logits/chosen": -0.23664550483226776, "logits/rejected": -0.512402355670929, "logps/chosen": -375.54998779296875, "logps/rejected": -430.5, "loss": 0.0087, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9921386241912842, "rewards/margins": 8.314062118530273, "rewards/rejected": -10.3125, "step": 10640 }, { "epoch": 2.807063784923564, "grad_norm": 2.5112095959160188, "learning_rate": 2.9829994728518714e-07, "logits/chosen": -0.2881050109863281, "logits/rejected": -0.5727783441543579, "logps/chosen": -414.70001220703125, "logps/rejected": -430.54998779296875, "loss": 0.0142, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7267577648162842, "rewards/margins": 8.606249809265137, "rewards/rejected": -10.334375381469727, "step": 10650 }, { "epoch": 2.809699525566684, "grad_norm": 0.9139475455174054, "learning_rate": 2.9764101212440693e-07, "logits/chosen": -0.36571043729782104, "logits/rejected": -0.45416259765625, "logps/chosen": -384.8500061035156, "logps/rejected": -440.3999938964844, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -1.653955101966858, "rewards/margins": 8.940625190734863, "rewards/rejected": -10.596875190734863, "step": 10660 }, { "epoch": 2.812335266209805, "grad_norm": 7.315220212571792, "learning_rate": 2.969820769636268e-07, "logits/chosen": -0.19980469346046448, "logits/rejected": -0.526660144329071, "logps/chosen": -387.25, "logps/rejected": -468.1499938964844, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -1.325537085533142, "rewards/margins": 8.699999809265137, "rewards/rejected": -10.035937309265137, "step": 10670 }, { "epoch": 2.8149710068529257, "grad_norm": 4.8585960903324255, "learning_rate": 2.963231418028466e-07, "logits/chosen": -0.39027100801467896, "logits/rejected": -0.6348022222518921, "logps/chosen": -352.0, "logps/rejected": -422.0, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2388672828674316, "rewards/margins": 8.278124809265137, "rewards/rejected": -10.512499809265137, "step": 10680 }, { "epoch": 2.8176067474960464, "grad_norm": 0.9521574344703351, "learning_rate": 2.9566420664206643e-07, "logits/chosen": -0.3150878846645355, "logits/rejected": -0.5855346918106079, "logps/chosen": -395.6499938964844, "logps/rejected": -445.6000061035156, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.01123046875, "rewards/margins": 8.735937118530273, "rewards/rejected": -10.75, "step": 10690 }, { "epoch": 2.8202424881391672, "grad_norm": 26.24325402300406, "learning_rate": 2.9500527148128623e-07, "logits/chosen": -0.32799071073532104, "logits/rejected": -0.595019519329071, "logps/chosen": -388.75, "logps/rejected": -440.20001220703125, "loss": 0.0137, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7759277820587158, "rewards/margins": 8.709375381469727, "rewards/rejected": -10.493749618530273, "step": 10700 }, { "epoch": 2.8228782287822876, "grad_norm": 47.52382714668284, "learning_rate": 2.94346336320506e-07, "logits/chosen": -0.3726844787597656, "logits/rejected": -0.6175903081893921, "logps/chosen": -410.5, "logps/rejected": -450.0, "loss": 0.0151, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.119732618331909, "rewards/margins": 8.837499618530273, "rewards/rejected": -10.953125, "step": 10710 }, { "epoch": 2.8255139694254083, "grad_norm": 7.257816319152626, "learning_rate": 2.936874011597258e-07, "logits/chosen": -0.305908203125, "logits/rejected": -0.7442382574081421, "logps/chosen": -407.8500061035156, "logps/rejected": -484.70001220703125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -2.4110350608825684, "rewards/margins": 8.514062881469727, "rewards/rejected": -10.928125381469727, "step": 10720 }, { "epoch": 2.828149710068529, "grad_norm": 10.297539437563405, "learning_rate": 2.9302846599894573e-07, "logits/chosen": -0.3865966796875, "logits/rejected": -0.7459961175918579, "logps/chosen": -366.45001220703125, "logps/rejected": -426.0, "loss": 0.0227, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4078125953674316, "rewards/margins": 8.510937690734863, "rewards/rejected": -10.920312881469727, "step": 10730 }, { "epoch": 2.83078545071165, "grad_norm": 2.8666633491481326, "learning_rate": 2.923695308381655e-07, "logits/chosen": -0.33979493379592896, "logits/rejected": -0.6944335699081421, "logps/chosen": -408.3500061035156, "logps/rejected": -419.29998779296875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -3.142578125, "rewards/margins": 8.451562881469727, "rewards/rejected": -11.596875190734863, "step": 10740 }, { "epoch": 2.8334211913547707, "grad_norm": 2.0374067622044993, "learning_rate": 2.917105956773853e-07, "logits/chosen": -0.49689942598342896, "logits/rejected": -0.6749267578125, "logps/chosen": -398.8500061035156, "logps/rejected": -495.6000061035156, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -2.791015625, "rewards/margins": 8.834375381469727, "rewards/rejected": -11.631250381469727, "step": 10750 }, { "epoch": 2.8360569319978914, "grad_norm": 3.8581734473005462, "learning_rate": 2.9105166051660517e-07, "logits/chosen": -0.3177490234375, "logits/rejected": -0.528124988079071, "logps/chosen": -399.0, "logps/rejected": -421.20001220703125, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.129101514816284, "rewards/margins": 8.223437309265137, "rewards/rejected": -10.359375, "step": 10760 }, { "epoch": 2.838692672641012, "grad_norm": 13.235487887157918, "learning_rate": 2.9039272535582497e-07, "logits/chosen": -0.45368653535842896, "logits/rejected": -0.504412829875946, "logps/chosen": -403.45001220703125, "logps/rejected": -473.5, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -2.2378907203674316, "rewards/margins": 8.625, "rewards/rejected": -10.868749618530273, "step": 10770 }, { "epoch": 2.841328413284133, "grad_norm": 9.539026308332817, "learning_rate": 2.8973379019504477e-07, "logits/chosen": -0.4114013612270355, "logits/rejected": -0.550097644329071, "logps/chosen": -412.6499938964844, "logps/rejected": -454.5, "loss": 0.0187, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.186718702316284, "rewards/margins": 8.65625, "rewards/rejected": -10.837499618530273, "step": 10780 }, { "epoch": 2.8439641539272538, "grad_norm": 5.95648596151114, "learning_rate": 2.890748550342646e-07, "logits/chosen": -0.5001159906387329, "logits/rejected": -0.617828369140625, "logps/chosen": -384.04998779296875, "logps/rejected": -460.29998779296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.150390625, "rewards/margins": 8.643750190734863, "rewards/rejected": -10.795312881469727, "step": 10790 }, { "epoch": 2.8465998945703745, "grad_norm": 9.498477978892135, "learning_rate": 2.8841591987348447e-07, "logits/chosen": -0.41769105195999146, "logits/rejected": -0.651293933391571, "logps/chosen": -373.20001220703125, "logps/rejected": -446.29998779296875, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2061524391174316, "rewards/margins": 8.537500381469727, "rewards/rejected": -10.743749618530273, "step": 10800 }, { "epoch": 2.849235635213495, "grad_norm": 70.53401666154187, "learning_rate": 2.8775698471270426e-07, "logits/chosen": -0.2705932557582855, "logits/rejected": -0.6552489995956421, "logps/chosen": -430.45001220703125, "logps/rejected": -454.70001220703125, "loss": 0.0136, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.060803174972534, "rewards/margins": 8.707812309265137, "rewards/rejected": -10.778124809265137, "step": 10810 }, { "epoch": 2.8518713758566157, "grad_norm": 15.503034773791327, "learning_rate": 2.8709804955192406e-07, "logits/chosen": -0.49261474609375, "logits/rejected": -0.6253906488418579, "logps/chosen": -412.3999938964844, "logps/rejected": -451.5, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.5703613758087158, "rewards/margins": 8.337499618530273, "rewards/rejected": -9.904687881469727, "step": 10820 }, { "epoch": 2.8545071164997364, "grad_norm": 32.8003794311376, "learning_rate": 2.8643911439114386e-07, "logits/chosen": -0.37592774629592896, "logits/rejected": -0.592822253704071, "logps/chosen": -375.29998779296875, "logps/rejected": -402.0, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -1.8114745616912842, "rewards/margins": 8.231249809265137, "rewards/rejected": -10.037500381469727, "step": 10830 }, { "epoch": 2.857142857142857, "grad_norm": 2.834561341666164, "learning_rate": 2.8578017923036376e-07, "logits/chosen": -0.23911742866039276, "logits/rejected": -0.550732433795929, "logps/chosen": -383.1000061035156, "logps/rejected": -449.8999938964844, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -1.3671875, "rewards/margins": 8.917187690734863, "rewards/rejected": -10.278124809265137, "step": 10840 }, { "epoch": 2.859778597785978, "grad_norm": 31.472917049824176, "learning_rate": 2.8512124406958356e-07, "logits/chosen": -0.3090934753417969, "logits/rejected": -0.5634094476699829, "logps/chosen": -418.1000061035156, "logps/rejected": -472.3999938964844, "loss": 0.0135, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.521582007408142, "rewards/margins": 8.301562309265137, "rewards/rejected": -9.824999809265137, "step": 10850 }, { "epoch": 2.8624143384290983, "grad_norm": 14.22141759338623, "learning_rate": 2.8446230890880336e-07, "logits/chosen": -0.17085877060890198, "logits/rejected": -0.541577160358429, "logps/chosen": -350.3999938964844, "logps/rejected": -461.5, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.715356469154358, "rewards/margins": 8.690625190734863, "rewards/rejected": -10.407812118530273, "step": 10860 }, { "epoch": 2.865050079072219, "grad_norm": 34.47245676471108, "learning_rate": 2.8380337374802315e-07, "logits/chosen": -0.45032960176467896, "logits/rejected": -0.532543957233429, "logps/chosen": -414.25, "logps/rejected": -473.29998779296875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -2.062695264816284, "rewards/margins": 8.862500190734863, "rewards/rejected": -10.928125381469727, "step": 10870 }, { "epoch": 2.86768581971534, "grad_norm": 25.666897209389198, "learning_rate": 2.83144438587243e-07, "logits/chosen": -0.4480224549770355, "logits/rejected": -0.579394519329071, "logps/chosen": -392.29998779296875, "logps/rejected": -423.79998779296875, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.089550733566284, "rewards/margins": 8.190625190734863, "rewards/rejected": -10.274999618530273, "step": 10880 }, { "epoch": 2.8703215603584606, "grad_norm": 11.586150895945096, "learning_rate": 2.824855034264628e-07, "logits/chosen": -0.41845703125, "logits/rejected": -0.511767566204071, "logps/chosen": -380.6499938964844, "logps/rejected": -435.5, "loss": 0.0329, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0628905296325684, "rewards/margins": 8.181249618530273, "rewards/rejected": -10.254687309265137, "step": 10890 }, { "epoch": 2.8729573010015814, "grad_norm": 6.195190486267771, "learning_rate": 2.8182656826568265e-07, "logits/chosen": -0.6620849370956421, "logits/rejected": -0.7267211675643921, "logps/chosen": -404.5, "logps/rejected": -456.8999938964844, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -2.1953125, "rewards/margins": 8.649999618530273, "rewards/rejected": -10.840624809265137, "step": 10900 }, { "epoch": 2.875593041644702, "grad_norm": 1.2752508478109654, "learning_rate": 2.811676331049025e-07, "logits/chosen": -0.30424803495407104, "logits/rejected": -0.7740234136581421, "logps/chosen": -445.6000061035156, "logps/rejected": -469.29998779296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.703515648841858, "rewards/margins": 9.239062309265137, "rewards/rejected": -10.934374809265137, "step": 10910 }, { "epoch": 2.878228782287823, "grad_norm": 3.691497076468412, "learning_rate": 2.805086979441223e-07, "logits/chosen": -0.33781737089157104, "logits/rejected": -0.6516357660293579, "logps/chosen": -423.1499938964844, "logps/rejected": -450.45001220703125, "loss": 0.0142, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5747802257537842, "rewards/margins": 8.860937118530273, "rewards/rejected": -10.440625190734863, "step": 10920 }, { "epoch": 2.8808645229309437, "grad_norm": 3.8794272770577507, "learning_rate": 2.798497627833421e-07, "logits/chosen": -0.2871643006801605, "logits/rejected": -0.6632324457168579, "logps/chosen": -381.54998779296875, "logps/rejected": -416.1499938964844, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -1.1396484375, "rewards/margins": 8.620312690734863, "rewards/rejected": -9.768750190734863, "step": 10930 }, { "epoch": 2.8835002635740645, "grad_norm": 9.205175085583388, "learning_rate": 2.791908276225619e-07, "logits/chosen": -0.46071165800094604, "logits/rejected": -0.624835193157196, "logps/chosen": -405.1499938964844, "logps/rejected": -454.79998779296875, "loss": 0.0269, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.38330078125, "rewards/margins": 8.4375, "rewards/rejected": -9.821874618530273, "step": 10940 }, { "epoch": 2.8861360042171853, "grad_norm": 10.050562633200586, "learning_rate": 2.7853189246178174e-07, "logits/chosen": -0.3630615174770355, "logits/rejected": -0.6254364252090454, "logps/chosen": -405.6000061035156, "logps/rejected": -459.8999938964844, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -1.49951171875, "rewards/margins": 8.496874809265137, "rewards/rejected": -9.995312690734863, "step": 10950 }, { "epoch": 2.8887717448603056, "grad_norm": 1.7391773692603294, "learning_rate": 2.778729573010016e-07, "logits/chosen": -0.47309571504592896, "logits/rejected": -0.573107898235321, "logps/chosen": -378.1499938964844, "logps/rejected": -429.3999938964844, "loss": 0.0401, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5812499523162842, "rewards/margins": 7.9375, "rewards/rejected": -9.524999618530273, "step": 10960 }, { "epoch": 2.8914074855034264, "grad_norm": 1.1285758323929969, "learning_rate": 2.772140221402214e-07, "logits/chosen": -0.3104248046875, "logits/rejected": -0.630175769329071, "logps/chosen": -426.0, "logps/rejected": -453.04998779296875, "loss": 0.0114, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.017285108566284, "rewards/margins": 8.565625190734863, "rewards/rejected": -10.589062690734863, "step": 10970 }, { "epoch": 2.894043226146547, "grad_norm": 9.635883316147153, "learning_rate": 2.765550869794412e-07, "logits/chosen": -0.18995361030101776, "logits/rejected": -0.677624523639679, "logps/chosen": -371.57501220703125, "logps/rejected": -419.3999938964844, "loss": 0.0195, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3197264671325684, "rewards/margins": 8.068750381469727, "rewards/rejected": -10.399999618530273, "step": 10980 }, { "epoch": 2.896678966789668, "grad_norm": 2.4212615048991895, "learning_rate": 2.7589615181866104e-07, "logits/chosen": -0.23393554985523224, "logits/rejected": -0.5702148675918579, "logps/chosen": -386.45001220703125, "logps/rejected": -459.29998779296875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.9643065929412842, "rewards/margins": 8.581250190734863, "rewards/rejected": -10.553125381469727, "step": 10990 }, { "epoch": 2.8993147074327887, "grad_norm": 12.390536995084881, "learning_rate": 2.7523721665788084e-07, "logits/chosen": -0.36503297090530396, "logits/rejected": -0.71826171875, "logps/chosen": -389.04998779296875, "logps/rejected": -421.75, "loss": 0.033, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.3970704078674316, "rewards/margins": 8.740625381469727, "rewards/rejected": -11.131250381469727, "step": 11000 }, { "epoch": 2.901950448075909, "grad_norm": 0.8563136116393749, "learning_rate": 2.745782814971007e-07, "logits/chosen": -0.49609375, "logits/rejected": -0.685009777545929, "logps/chosen": -377.20001220703125, "logps/rejected": -443.5, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.022631883621216, "rewards/margins": 8.995312690734863, "rewards/rejected": -11.020312309265137, "step": 11010 }, { "epoch": 2.90458618871903, "grad_norm": 1.7817161416773757, "learning_rate": 2.739193463363205e-07, "logits/chosen": -0.5060790777206421, "logits/rejected": -0.6265503168106079, "logps/chosen": -368.1000061035156, "logps/rejected": -442.5, "loss": 0.0406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.21435546875, "rewards/margins": 8.856249809265137, "rewards/rejected": -11.068750381469727, "step": 11020 }, { "epoch": 2.9072219293621506, "grad_norm": 16.96831376703409, "learning_rate": 2.7326041117554033e-07, "logits/chosen": -0.32258301973342896, "logits/rejected": -0.737255871295929, "logps/chosen": -416.3500061035156, "logps/rejected": -442.0, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -2.318359375, "rewards/margins": 8.96875, "rewards/rejected": -11.293749809265137, "step": 11030 }, { "epoch": 2.9098576700052714, "grad_norm": 3.410599481262814, "learning_rate": 2.7260147601476013e-07, "logits/chosen": -0.3899902403354645, "logits/rejected": -0.6245635747909546, "logps/chosen": -399.29998779296875, "logps/rejected": -443.0, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.40283203125, "rewards/margins": 8.896875381469727, "rewards/rejected": -11.300000190734863, "step": 11040 }, { "epoch": 2.912493410648392, "grad_norm": 24.768387860210048, "learning_rate": 2.7194254085397993e-07, "logits/chosen": -0.354248046875, "logits/rejected": -0.707470715045929, "logps/chosen": -392.70001220703125, "logps/rejected": -467.3999938964844, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -2.8404297828674316, "rewards/margins": 8.698437690734863, "rewards/rejected": -11.537500381469727, "step": 11050 }, { "epoch": 2.915129151291513, "grad_norm": 19.605841482991774, "learning_rate": 2.712836056931998e-07, "logits/chosen": -0.530712902545929, "logits/rejected": -0.683667004108429, "logps/chosen": -428.20001220703125, "logps/rejected": -515.4000244140625, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -2.62109375, "rewards/margins": 9.1328125, "rewards/rejected": -11.753125190734863, "step": 11060 }, { "epoch": 2.9177648919346337, "grad_norm": 1.803013985057808, "learning_rate": 2.7062467053241963e-07, "logits/chosen": -0.3006591796875, "logits/rejected": -0.6844238042831421, "logps/chosen": -397.8999938964844, "logps/rejected": -453.29998779296875, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.785937547683716, "rewards/margins": 8.856249809265137, "rewards/rejected": -11.640625, "step": 11070 }, { "epoch": 2.9204006325777545, "grad_norm": 2.3574053933724173, "learning_rate": 2.699657353716394e-07, "logits/chosen": -0.4698730409145355, "logits/rejected": -0.610919177532196, "logps/chosen": -378.45001220703125, "logps/rejected": -448.1499938964844, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.986132860183716, "rewards/margins": 8.640625, "rewards/rejected": -11.621874809265137, "step": 11080 }, { "epoch": 2.9230363732208753, "grad_norm": 15.985078421418054, "learning_rate": 2.693068002108592e-07, "logits/chosen": -0.18175049126148224, "logits/rejected": -0.6732422113418579, "logps/chosen": -361.20001220703125, "logps/rejected": -422.70001220703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.662890672683716, "rewards/margins": 8.457812309265137, "rewards/rejected": -11.112500190734863, "step": 11090 }, { "epoch": 2.925672113863996, "grad_norm": 32.645696445365466, "learning_rate": 2.686478650500791e-07, "logits/chosen": -0.37434083223342896, "logits/rejected": -0.7342284917831421, "logps/chosen": -371.2749938964844, "logps/rejected": -437.79998779296875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.6087889671325684, "rewards/margins": 8.793749809265137, "rewards/rejected": -11.403124809265137, "step": 11100 }, { "epoch": 2.9283078545071164, "grad_norm": 3.3424720964492356, "learning_rate": 2.6798892988929887e-07, "logits/chosen": -0.5775054693222046, "logits/rejected": -0.72802734375, "logps/chosen": -424.8999938964844, "logps/rejected": -476.5, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -2.6937499046325684, "rewards/margins": 8.6875, "rewards/rejected": -11.378125190734863, "step": 11110 }, { "epoch": 2.930943595150237, "grad_norm": 4.606690729651326, "learning_rate": 2.6732999472851867e-07, "logits/chosen": -0.3012329041957855, "logits/rejected": -0.6993468999862671, "logps/chosen": -379.70001220703125, "logps/rejected": -400.0, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2523436546325684, "rewards/margins": 8.399999618530273, "rewards/rejected": -10.640625, "step": 11120 }, { "epoch": 2.933579335793358, "grad_norm": 16.7024761444194, "learning_rate": 2.666710595677385e-07, "logits/chosen": -0.523425281047821, "logits/rejected": -0.66162109375, "logps/chosen": -390.6499938964844, "logps/rejected": -437.1000061035156, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.484570264816284, "rewards/margins": 8.387499809265137, "rewards/rejected": -10.868749618530273, "step": 11130 }, { "epoch": 2.9362150764364787, "grad_norm": 2.15834689627536, "learning_rate": 2.6601212440695837e-07, "logits/chosen": -0.3006347715854645, "logits/rejected": -0.575390636920929, "logps/chosen": -422.70001220703125, "logps/rejected": -457.5, "loss": 0.0202, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2474608421325684, "rewards/margins": 8.296875, "rewards/rejected": -10.550000190734863, "step": 11140 }, { "epoch": 2.9388508170795995, "grad_norm": 6.418115107183829, "learning_rate": 2.6535318924617817e-07, "logits/chosen": -0.44503480195999146, "logits/rejected": -0.6539062261581421, "logps/chosen": -402.3999938964844, "logps/rejected": -452.5, "loss": 0.0257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.132519483566284, "rewards/margins": 8.598437309265137, "rewards/rejected": -10.731249809265137, "step": 11150 }, { "epoch": 2.94148655772272, "grad_norm": 3.6125371927930168, "learning_rate": 2.6469425408539796e-07, "logits/chosen": -0.3610595762729645, "logits/rejected": -0.650561511516571, "logps/chosen": -439.3500061035156, "logps/rejected": -465.20001220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.874121069908142, "rewards/margins": 9.018750190734863, "rewards/rejected": -10.899999618530273, "step": 11160 }, { "epoch": 2.9441222983658406, "grad_norm": 54.293655596999834, "learning_rate": 2.6403531892461776e-07, "logits/chosen": -0.47893065214157104, "logits/rejected": -0.73291015625, "logps/chosen": -377.45001220703125, "logps/rejected": -428.04998779296875, "loss": 0.0323, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.300976514816284, "rewards/margins": 8.326562881469727, "rewards/rejected": -10.621874809265137, "step": 11170 }, { "epoch": 2.9467580390089614, "grad_norm": 0.5941697153358607, "learning_rate": 2.6337638376383766e-07, "logits/chosen": -0.34638673067092896, "logits/rejected": -0.723071277141571, "logps/chosen": -369.54998779296875, "logps/rejected": -436.8999938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.05859375, "rewards/margins": 8.131250381469727, "rewards/rejected": -10.189062118530273, "step": 11180 }, { "epoch": 2.949393779652082, "grad_norm": 2.316551541626966, "learning_rate": 2.6271744860305746e-07, "logits/chosen": -0.41032713651657104, "logits/rejected": -0.6218017339706421, "logps/chosen": -420.95001220703125, "logps/rejected": -480.20001220703125, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.972265601158142, "rewards/margins": 9.248437881469727, "rewards/rejected": -11.225000381469727, "step": 11190 }, { "epoch": 2.952029520295203, "grad_norm": 46.07533602548045, "learning_rate": 2.6205851344227726e-07, "logits/chosen": -0.3286499083042145, "logits/rejected": -0.620190441608429, "logps/chosen": -413.79998779296875, "logps/rejected": -446.8999938964844, "loss": 0.0183, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8876953125, "rewards/margins": 8.734375, "rewards/rejected": -10.628125190734863, "step": 11200 }, { "epoch": 2.9546652609383237, "grad_norm": 22.28398949132381, "learning_rate": 2.613995782814971e-07, "logits/chosen": -0.31201171875, "logits/rejected": -0.5964111089706421, "logps/chosen": -411.79998779296875, "logps/rejected": -446.20001220703125, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.0234375, "rewards/margins": 8.395312309265137, "rewards/rejected": -10.417187690734863, "step": 11210 }, { "epoch": 2.9573010015814445, "grad_norm": 9.785835978299113, "learning_rate": 2.607406431207169e-07, "logits/chosen": -0.45634156465530396, "logits/rejected": -0.744946300983429, "logps/chosen": -386.04998779296875, "logps/rejected": -475.70001220703125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -2.383593797683716, "rewards/margins": 8.989062309265137, "rewards/rejected": -11.365625381469727, "step": 11220 }, { "epoch": 2.9599367422245653, "grad_norm": 2.3441807544943374, "learning_rate": 2.600817079599367e-07, "logits/chosen": -0.5838378667831421, "logits/rejected": -0.7700439691543579, "logps/chosen": -371.1499938964844, "logps/rejected": -404.25, "loss": 0.012, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.811718702316284, "rewards/margins": 8.056249618530273, "rewards/rejected": -10.857812881469727, "step": 11230 }, { "epoch": 2.962572482867686, "grad_norm": 24.89236598843499, "learning_rate": 2.5942277279915655e-07, "logits/chosen": -0.3010009825229645, "logits/rejected": -0.692614734172821, "logps/chosen": -404.20001220703125, "logps/rejected": -440.20001220703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -2.406445264816284, "rewards/margins": 8.526562690734863, "rewards/rejected": -10.935937881469727, "step": 11240 }, { "epoch": 2.9652082235108064, "grad_norm": 5.750977101033848, "learning_rate": 2.587638376383764e-07, "logits/chosen": -0.372314453125, "logits/rejected": -0.669079601764679, "logps/chosen": -386.95001220703125, "logps/rejected": -436.8999938964844, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.373046875, "rewards/margins": 9.081250190734863, "rewards/rejected": -11.449999809265137, "step": 11250 }, { "epoch": 2.967843964153927, "grad_norm": 6.5418799908198455, "learning_rate": 2.581049024775962e-07, "logits/chosen": -0.40120238065719604, "logits/rejected": -0.6752685308456421, "logps/chosen": -429.25, "logps/rejected": -491.79998779296875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -1.6229979991912842, "rewards/margins": 9.332812309265137, "rewards/rejected": -10.956250190734863, "step": 11260 }, { "epoch": 2.970479704797048, "grad_norm": 0.8531176695756885, "learning_rate": 2.57445967316816e-07, "logits/chosen": -0.3116455078125, "logits/rejected": -0.595654308795929, "logps/chosen": -432.0, "logps/rejected": -470.70001220703125, "loss": 0.0356, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.80517578125, "rewards/margins": 8.300000190734863, "rewards/rejected": -10.100000381469727, "step": 11270 }, { "epoch": 2.9731154454401687, "grad_norm": 3.1023907995461255, "learning_rate": 2.567870321560358e-07, "logits/chosen": -0.54473876953125, "logits/rejected": -0.847900390625, "logps/chosen": -396.29998779296875, "logps/rejected": -454.8999938964844, "loss": 0.0181, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1877074241638184, "rewards/margins": 8.481249809265137, "rewards/rejected": -10.665624618530273, "step": 11280 }, { "epoch": 2.9757511860832895, "grad_norm": 1.6165694776925765, "learning_rate": 2.5612809699525565e-07, "logits/chosen": -0.32196044921875, "logits/rejected": -0.5546935796737671, "logps/chosen": -361.95001220703125, "logps/rejected": -389.8999938964844, "loss": 0.0128, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6708984375, "rewards/margins": 8.284375190734863, "rewards/rejected": -9.954687118530273, "step": 11290 }, { "epoch": 2.9783869267264103, "grad_norm": 0.7248170227121348, "learning_rate": 2.554691618344755e-07, "logits/chosen": -0.3988708555698395, "logits/rejected": -0.617016613483429, "logps/chosen": -382.25, "logps/rejected": -422.25, "loss": 0.0108, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.25830078125, "rewards/margins": 8.231249809265137, "rewards/rejected": -10.489062309265137, "step": 11300 }, { "epoch": 2.9810226673695306, "grad_norm": 24.960560522067073, "learning_rate": 2.548102266736953e-07, "logits/chosen": -0.44386595487594604, "logits/rejected": -0.6007324457168579, "logps/chosen": -374.04998779296875, "logps/rejected": -409.70001220703125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.0484862327575684, "rewards/margins": 8.28125, "rewards/rejected": -10.332812309265137, "step": 11310 }, { "epoch": 2.9836584080126514, "grad_norm": 10.584944688272408, "learning_rate": 2.5415129151291514e-07, "logits/chosen": -0.3937011659145355, "logits/rejected": -0.6925293207168579, "logps/chosen": -388.54998779296875, "logps/rejected": -461.29998779296875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.050976514816284, "rewards/margins": 8.404687881469727, "rewards/rejected": -10.456250190734863, "step": 11320 }, { "epoch": 2.986294148655772, "grad_norm": 15.963951375563978, "learning_rate": 2.5349235635213494e-07, "logits/chosen": -0.21590575575828552, "logits/rejected": -0.5583862066268921, "logps/chosen": -411.8999938964844, "logps/rejected": -482.3999938964844, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.850488305091858, "rewards/margins": 8.965624809265137, "rewards/rejected": -10.815625190734863, "step": 11330 }, { "epoch": 2.988929889298893, "grad_norm": 1.7040690979847162, "learning_rate": 2.5283342119135474e-07, "logits/chosen": -0.557080090045929, "logits/rejected": -0.650195300579071, "logps/chosen": -382.04998779296875, "logps/rejected": -463.8999938964844, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -2.5634765625, "rewards/margins": 8.675000190734863, "rewards/rejected": -11.237500190734863, "step": 11340 }, { "epoch": 2.9915656299420137, "grad_norm": 2.7951595012033223, "learning_rate": 2.521744860305746e-07, "logits/chosen": -0.4208984375, "logits/rejected": -0.634692370891571, "logps/chosen": -384.1000061035156, "logps/rejected": -412.29998779296875, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -1.791601538658142, "rewards/margins": 8.642187118530273, "rewards/rejected": -10.432812690734863, "step": 11350 }, { "epoch": 2.9942013705851345, "grad_norm": 5.533372047637642, "learning_rate": 2.5151555086979444e-07, "logits/chosen": -0.34388428926467896, "logits/rejected": -0.6145874261856079, "logps/chosen": -379.3999938964844, "logps/rejected": -432.1000061035156, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -2.5550780296325684, "rewards/margins": 8.765625, "rewards/rejected": -11.324999809265137, "step": 11360 }, { "epoch": 2.9968371112282552, "grad_norm": 0.9499361579690516, "learning_rate": 2.5085661570901424e-07, "logits/chosen": -0.35560303926467896, "logits/rejected": -0.760937511920929, "logps/chosen": -425.8500061035156, "logps/rejected": -471.8999938964844, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -2.771679639816284, "rewards/margins": 9.028124809265137, "rewards/rejected": -11.800000190734863, "step": 11370 }, { "epoch": 2.999472851871376, "grad_norm": 1.7122448578094753, "learning_rate": 2.5019768054823403e-07, "logits/chosen": -0.21651192009449005, "logits/rejected": -0.594433605670929, "logps/chosen": -387.25, "logps/rejected": -457.04998779296875, "loss": 0.017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.783398389816284, "rewards/margins": 9.578125, "rewards/rejected": -12.3671875, "step": 11380 }, { "epoch": 3.0021085925144964, "grad_norm": 0.481325657242847, "learning_rate": 2.495387453874539e-07, "logits/chosen": -0.30820924043655396, "logits/rejected": -0.629638671875, "logps/chosen": -459.0, "logps/rejected": -504.5, "loss": 0.0112, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.474804639816284, "rewards/margins": 9.609375, "rewards/rejected": -12.074999809265137, "step": 11390 }, { "epoch": 3.004744333157617, "grad_norm": 0.0986085626757933, "learning_rate": 2.488798102266737e-07, "logits/chosen": -0.25620728731155396, "logits/rejected": -0.747363269329071, "logps/chosen": -407.1499938964844, "logps/rejected": -440.6000061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.5746092796325684, "rewards/margins": 9.904687881469727, "rewards/rejected": -12.475000381469727, "step": 11400 }, { "epoch": 3.007380073800738, "grad_norm": 0.37595053902785686, "learning_rate": 2.4822087506589353e-07, "logits/chosen": -0.3888916075229645, "logits/rejected": -0.7080078125, "logps/chosen": -396.70001220703125, "logps/rejected": -447.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.4808592796325684, "rewards/margins": 9.609375, "rewards/rejected": -12.087499618530273, "step": 11410 }, { "epoch": 3.0100158144438587, "grad_norm": 0.39851004321349, "learning_rate": 2.4756193990511333e-07, "logits/chosen": -0.1925048828125, "logits/rejected": -0.601318359375, "logps/chosen": -405.04998779296875, "logps/rejected": -464.0, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.3785157203674316, "rewards/margins": 9.387499809265137, "rewards/rejected": -11.774999618530273, "step": 11420 }, { "epoch": 3.0126515550869795, "grad_norm": 0.4726133213449336, "learning_rate": 2.469030047443331e-07, "logits/chosen": -0.21617431938648224, "logits/rejected": -0.649151623249054, "logps/chosen": -377.0, "logps/rejected": -484.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.885351538658142, "rewards/margins": 10.071874618530273, "rewards/rejected": -11.953125, "step": 11430 }, { "epoch": 3.0152872957301002, "grad_norm": 1.653129764689825, "learning_rate": 2.46244069583553e-07, "logits/chosen": -0.41307371854782104, "logits/rejected": -0.67041015625, "logps/chosen": -392.25, "logps/rejected": -444.1000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.4424805641174316, "rewards/margins": 10.390625, "rewards/rejected": -12.837499618530273, "step": 11440 }, { "epoch": 3.017923036373221, "grad_norm": 0.34478402876375824, "learning_rate": 2.455851344227728e-07, "logits/chosen": -0.30595701932907104, "logits/rejected": -0.6744140386581421, "logps/chosen": -370.0, "logps/rejected": -457.1000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.8173828125, "rewards/margins": 10.053125381469727, "rewards/rejected": -12.878125190734863, "step": 11450 }, { "epoch": 3.020558777016342, "grad_norm": 0.3854901267644998, "learning_rate": 2.449261992619926e-07, "logits/chosen": -0.48328858613967896, "logits/rejected": -0.76611328125, "logps/chosen": -407.70001220703125, "logps/rejected": -495.5, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.4292969703674316, "rewards/margins": 10.574999809265137, "rewards/rejected": -13.009374618530273, "step": 11460 }, { "epoch": 3.023194517659462, "grad_norm": 1.6394498940062956, "learning_rate": 2.442672641012124e-07, "logits/chosen": -0.4731201231479645, "logits/rejected": -0.698535144329071, "logps/chosen": -435.8999938964844, "logps/rejected": -519.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.6634764671325684, "rewards/margins": 10.071874618530273, "rewards/rejected": -12.721875190734863, "step": 11470 }, { "epoch": 3.025830258302583, "grad_norm": 0.5197214591576157, "learning_rate": 2.4360832894043227e-07, "logits/chosen": -0.3973327577114105, "logits/rejected": -0.7811523675918579, "logps/chosen": -397.5, "logps/rejected": -482.1000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.189453125, "rewards/margins": 9.925000190734863, "rewards/rejected": -12.112500190734863, "step": 11480 }, { "epoch": 3.0284659989457037, "grad_norm": 0.37862462679697484, "learning_rate": 2.4294939377965207e-07, "logits/chosen": -0.48161619901657104, "logits/rejected": -0.703662097454071, "logps/chosen": -399.6499938964844, "logps/rejected": -465.1000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.4969725608825684, "rewards/margins": 10.203125, "rewards/rejected": -12.709375381469727, "step": 11490 }, { "epoch": 3.0311017395888245, "grad_norm": 0.941285927825607, "learning_rate": 2.422904586188719e-07, "logits/chosen": -0.4520507752895355, "logits/rejected": -0.822460949420929, "logps/chosen": -376.54998779296875, "logps/rejected": -469.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.594921827316284, "rewards/margins": 10.321874618530273, "rewards/rejected": -12.912500381469727, "step": 11500 }, { "epoch": 3.0337374802319452, "grad_norm": 0.40230760742549077, "learning_rate": 2.416315234580917e-07, "logits/chosen": -0.584521472454071, "logits/rejected": -0.829907238483429, "logps/chosen": -348.20001220703125, "logps/rejected": -414.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.625195264816284, "rewards/margins": 9.574999809265137, "rewards/rejected": -12.203125, "step": 11510 }, { "epoch": 3.036373220875066, "grad_norm": 0.10639966610756746, "learning_rate": 2.4097258829731157e-07, "logits/chosen": -0.3734130859375, "logits/rejected": -0.836865246295929, "logps/chosen": -418.79998779296875, "logps/rejected": -476.1000061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.379150390625, "rewards/margins": 10.128125190734863, "rewards/rejected": -12.503125190734863, "step": 11520 }, { "epoch": 3.039008961518187, "grad_norm": 0.30472624840410273, "learning_rate": 2.4031365313653136e-07, "logits/chosen": -0.5789550542831421, "logits/rejected": -0.8363037109375, "logps/chosen": -392.5, "logps/rejected": -449.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.775195360183716, "rewards/margins": 10.095312118530273, "rewards/rejected": -12.865625381469727, "step": 11530 }, { "epoch": 3.041644702161307, "grad_norm": 0.6307916291875445, "learning_rate": 2.3965471797575116e-07, "logits/chosen": -0.304443359375, "logits/rejected": -0.696148693561554, "logps/chosen": -405.20001220703125, "logps/rejected": -437.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4122557640075684, "rewards/margins": 9.896875381469727, "rewards/rejected": -12.306249618530273, "step": 11540 }, { "epoch": 3.044280442804428, "grad_norm": 0.46452772424480876, "learning_rate": 2.38995782814971e-07, "logits/chosen": -0.5618896484375, "logits/rejected": -0.80401611328125, "logps/chosen": -374.1000061035156, "logps/rejected": -460.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.540332078933716, "rewards/margins": 10.615625381469727, "rewards/rejected": -13.159375190734863, "step": 11550 }, { "epoch": 3.0469161834475487, "grad_norm": 0.9050054657339022, "learning_rate": 2.383368476541908e-07, "logits/chosen": -0.4337615966796875, "logits/rejected": -0.7468506097793579, "logps/chosen": -405.3500061035156, "logps/rejected": -464.1000061035156, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.5859375, "rewards/margins": 10.073437690734863, "rewards/rejected": -12.668749809265137, "step": 11560 }, { "epoch": 3.0495519240906694, "grad_norm": 0.7812872967114554, "learning_rate": 2.3767791249341063e-07, "logits/chosen": -0.548779308795929, "logits/rejected": -0.735888659954071, "logps/chosen": -397.6499938964844, "logps/rejected": -479.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4698729515075684, "rewards/margins": 10.5078125, "rewards/rejected": -12.978124618530273, "step": 11570 }, { "epoch": 3.05218766473379, "grad_norm": 0.2773354605685119, "learning_rate": 2.3701897733263046e-07, "logits/chosen": -0.5202881097793579, "logits/rejected": -0.812304675579071, "logps/chosen": -382.5, "logps/rejected": -445.6000061035156, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.467968702316284, "rewards/margins": 9.909375190734863, "rewards/rejected": -12.375, "step": 11580 }, { "epoch": 3.054823405376911, "grad_norm": 0.9483719330047058, "learning_rate": 2.3636004217185025e-07, "logits/chosen": -0.47875672578811646, "logits/rejected": -0.746142566204071, "logps/chosen": -365.1000061035156, "logps/rejected": -475.8999938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.588085889816284, "rewards/margins": 10.3671875, "rewards/rejected": -12.953125, "step": 11590 }, { "epoch": 3.0574591460200318, "grad_norm": 0.09000695730829465, "learning_rate": 2.357011070110701e-07, "logits/chosen": -0.477294921875, "logits/rejected": -0.860583484172821, "logps/chosen": -388.20001220703125, "logps/rejected": -495.29998779296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.4722657203674316, "rewards/margins": 10.600000381469727, "rewards/rejected": -14.084375381469727, "step": 11600 }, { "epoch": 3.0600948866631525, "grad_norm": 4.850657077633242, "learning_rate": 2.3504217185028993e-07, "logits/chosen": -0.4613037109375, "logits/rejected": -0.6359909176826477, "logps/chosen": -433.3500061035156, "logps/rejected": -497.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.9234375953674316, "rewards/margins": 10.881250381469727, "rewards/rejected": -13.800000190734863, "step": 11610 }, { "epoch": 3.062730627306273, "grad_norm": 1.1839080182487878, "learning_rate": 2.3438323668950975e-07, "logits/chosen": -0.66650390625, "logits/rejected": -0.803906261920929, "logps/chosen": -398.8999938964844, "logps/rejected": -471.8999938964844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.153515577316284, "rewards/margins": 10.353124618530273, "rewards/rejected": -13.496874809265137, "step": 11620 }, { "epoch": 3.0653663679493937, "grad_norm": 1.7438329093574176, "learning_rate": 2.3372430152872957e-07, "logits/chosen": -0.3804565370082855, "logits/rejected": -0.5502685308456421, "logps/chosen": -388.8999938964844, "logps/rejected": -471.70001220703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.8681640625, "rewards/margins": 9.84375, "rewards/rejected": -12.715624809265137, "step": 11630 }, { "epoch": 3.0680021085925144, "grad_norm": 0.2954403673144411, "learning_rate": 2.3306536636794937e-07, "logits/chosen": -0.43498533964157104, "logits/rejected": -0.7801269292831421, "logps/chosen": -381.20001220703125, "logps/rejected": -476.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9585938453674316, "rewards/margins": 10.434374809265137, "rewards/rejected": -13.393750190734863, "step": 11640 }, { "epoch": 3.070637849235635, "grad_norm": 0.9164260874898018, "learning_rate": 2.3240643120716922e-07, "logits/chosen": -0.40791016817092896, "logits/rejected": -0.6935790777206421, "logps/chosen": -452.70001220703125, "logps/rejected": -502.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.267578125, "rewards/margins": 10.462499618530273, "rewards/rejected": -13.731249809265137, "step": 11650 }, { "epoch": 3.073273589878756, "grad_norm": 0.1916728269944976, "learning_rate": 2.3174749604638902e-07, "logits/chosen": -0.5506225824356079, "logits/rejected": -0.851855456829071, "logps/chosen": -400.45001220703125, "logps/rejected": -503.8999938964844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.7642579078674316, "rewards/margins": 11.171875, "rewards/rejected": -13.931249618530273, "step": 11660 }, { "epoch": 3.0759093305218768, "grad_norm": 0.7621023610231081, "learning_rate": 2.3108856088560884e-07, "logits/chosen": -0.33732908964157104, "logits/rejected": -0.6450439691543579, "logps/chosen": -387.04998779296875, "logps/rejected": -460.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.8695311546325684, "rewards/margins": 10.306249618530273, "rewards/rejected": -13.175000190734863, "step": 11670 }, { "epoch": 3.0785450711649975, "grad_norm": 0.4002101681714369, "learning_rate": 2.3042962572482867e-07, "logits/chosen": -0.4969238340854645, "logits/rejected": -0.921093761920929, "logps/chosen": -396.6000061035156, "logps/rejected": -512.2999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.135937452316284, "rewards/margins": 10.815625190734863, "rewards/rejected": -13.946874618530273, "step": 11680 }, { "epoch": 3.081180811808118, "grad_norm": 2.859766153456523, "learning_rate": 2.297706905640485e-07, "logits/chosen": -0.594042956829071, "logits/rejected": -0.67572021484375, "logps/chosen": -357.29998779296875, "logps/rejected": -436.25, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.698437452316284, "rewards/margins": 9.5546875, "rewards/rejected": -12.262499809265137, "step": 11690 }, { "epoch": 3.0838165524512386, "grad_norm": 0.5626516741660464, "learning_rate": 2.291117554032683e-07, "logits/chosen": -0.48175048828125, "logits/rejected": -0.8799804449081421, "logps/chosen": -440.20001220703125, "logps/rejected": -508.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.578125, "rewards/margins": 10.821874618530273, "rewards/rejected": -13.393750190734863, "step": 11700 }, { "epoch": 3.0864522930943594, "grad_norm": 0.328557028658191, "learning_rate": 2.2845282024248814e-07, "logits/chosen": -0.428955078125, "logits/rejected": -0.67205810546875, "logps/chosen": -413.8500061035156, "logps/rejected": -491.3999938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.5865235328674316, "rewards/margins": 10.8125, "rewards/rejected": -13.390625, "step": 11710 }, { "epoch": 3.08908803373748, "grad_norm": 0.4735318890746582, "learning_rate": 2.2779388508170794e-07, "logits/chosen": -0.608642578125, "logits/rejected": -0.8092041015625, "logps/chosen": -384.75, "logps/rejected": -487.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.484179735183716, "rewards/margins": 10.884374618530273, "rewards/rejected": -14.371874809265137, "step": 11720 }, { "epoch": 3.091723774380601, "grad_norm": 0.8237787289898072, "learning_rate": 2.2713494992092776e-07, "logits/chosen": -0.4367431700229645, "logits/rejected": -0.8699706792831421, "logps/chosen": -431.3999938964844, "logps/rejected": -482.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.021484375, "rewards/margins": 10.017187118530273, "rewards/rejected": -13.043749809265137, "step": 11730 }, { "epoch": 3.0943595150237218, "grad_norm": 1.1167375453013808, "learning_rate": 2.264760147601476e-07, "logits/chosen": -0.6204422116279602, "logits/rejected": -0.775585949420929, "logps/chosen": -375.0, "logps/rejected": -438.20001220703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.465625047683716, "rewards/margins": 10.557812690734863, "rewards/rejected": -14.024999618530273, "step": 11740 }, { "epoch": 3.0969952556668425, "grad_norm": 0.3162872121574132, "learning_rate": 2.258170795993674e-07, "logits/chosen": -0.41132813692092896, "logits/rejected": -0.752026379108429, "logps/chosen": -405.0, "logps/rejected": -482.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.797070264816284, "rewards/margins": 10.082812309265137, "rewards/rejected": -12.878125190734863, "step": 11750 }, { "epoch": 3.0996309963099633, "grad_norm": 1.3483119857756511, "learning_rate": 2.2515814443858723e-07, "logits/chosen": -0.562573254108429, "logits/rejected": -0.9664062261581421, "logps/chosen": -428.79998779296875, "logps/rejected": -485.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.174182176589966, "rewards/margins": 10.557812690734863, "rewards/rejected": -13.731249809265137, "step": 11760 }, { "epoch": 3.1022667369530836, "grad_norm": 0.19147418345811207, "learning_rate": 2.2449920927780705e-07, "logits/chosen": -0.5583114624023438, "logits/rejected": -0.90478515625, "logps/chosen": -382.6000061035156, "logps/rejected": -457.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.524609327316284, "rewards/margins": 10.621874809265137, "rewards/rejected": -14.149999618530273, "step": 11770 }, { "epoch": 3.1049024775962044, "grad_norm": 3.1723357256682454, "learning_rate": 2.2384027411702688e-07, "logits/chosen": -0.654736340045929, "logits/rejected": -0.7852538824081421, "logps/chosen": -419.3999938964844, "logps/rejected": -508.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.3265624046325684, "rewards/margins": 10.443750381469727, "rewards/rejected": -13.78125, "step": 11780 }, { "epoch": 3.107538218239325, "grad_norm": 9.410169458433069, "learning_rate": 2.231813389562467e-07, "logits/chosen": -0.500781238079071, "logits/rejected": -0.93524169921875, "logps/chosen": -388.54998779296875, "logps/rejected": -444.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.4027342796325684, "rewards/margins": 10.265625, "rewards/rejected": -13.668749809265137, "step": 11790 }, { "epoch": 3.110173958882446, "grad_norm": 1.8439786072106967, "learning_rate": 2.2252240379546653e-07, "logits/chosen": -0.5234130620956421, "logits/rejected": -0.867871105670929, "logps/chosen": -411.70001220703125, "logps/rejected": -484.6000061035156, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.328320264816284, "rewards/margins": 10.189062118530273, "rewards/rejected": -13.521875381469727, "step": 11800 }, { "epoch": 3.1128096995255667, "grad_norm": 1.1120731047602392, "learning_rate": 2.2186346863468632e-07, "logits/chosen": -0.5340820550918579, "logits/rejected": -0.890429675579071, "logps/chosen": -446.54998779296875, "logps/rejected": -504.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.757031202316284, "rewards/margins": 10.268750190734863, "rewards/rejected": -13.024999618530273, "step": 11810 }, { "epoch": 3.1154454401686875, "grad_norm": 0.9199440325141306, "learning_rate": 2.2120453347390617e-07, "logits/chosen": -0.50860595703125, "logits/rejected": -0.750866711139679, "logps/chosen": -411.1499938964844, "logps/rejected": -494.3999938964844, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.1363282203674316, "rewards/margins": 10.175000190734863, "rewards/rejected": -13.309374809265137, "step": 11820 }, { "epoch": 3.1180811808118083, "grad_norm": 6.323087430012066, "learning_rate": 2.2054559831312597e-07, "logits/chosen": -0.552319347858429, "logits/rejected": -0.817309558391571, "logps/chosen": -431.6000061035156, "logps/rejected": -485.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.2421875, "rewards/margins": 10.278124809265137, "rewards/rejected": -13.515625, "step": 11830 }, { "epoch": 3.1207169214549286, "grad_norm": 0.3771372309762929, "learning_rate": 2.198866631523458e-07, "logits/chosen": -0.4602294862270355, "logits/rejected": -0.812426745891571, "logps/chosen": -409.8999938964844, "logps/rejected": -532.75, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7734375, "rewards/margins": 11.212499618530273, "rewards/rejected": -13.975000381469727, "step": 11840 }, { "epoch": 3.1233526620980494, "grad_norm": 0.7163394018082153, "learning_rate": 2.1922772799156562e-07, "logits/chosen": -0.28380125761032104, "logits/rejected": -0.7694336175918579, "logps/chosen": -367.20001220703125, "logps/rejected": -449.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.7451171875, "rewards/margins": 10.415624618530273, "rewards/rejected": -13.159375190734863, "step": 11850 }, { "epoch": 3.12598840274117, "grad_norm": 1.894147787178133, "learning_rate": 2.1856879283078544e-07, "logits/chosen": -0.46879881620407104, "logits/rejected": -0.755786120891571, "logps/chosen": -418.54998779296875, "logps/rejected": -445.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.9796385765075684, "rewards/margins": 10.012499809265137, "rewards/rejected": -12.990625381469727, "step": 11860 }, { "epoch": 3.128624143384291, "grad_norm": 0.5437268094863168, "learning_rate": 2.1790985767000524e-07, "logits/chosen": -0.4514526426792145, "logits/rejected": -0.814379870891571, "logps/chosen": -390.20001220703125, "logps/rejected": -459.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.8550782203674316, "rewards/margins": 10.331250190734863, "rewards/rejected": -13.171875, "step": 11870 }, { "epoch": 3.1312598840274117, "grad_norm": 3.8429432866584703, "learning_rate": 2.172509225092251e-07, "logits/chosen": -0.4187255799770355, "logits/rejected": -0.682812511920929, "logps/chosen": -409.45001220703125, "logps/rejected": -482.70001220703125, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.663867235183716, "rewards/margins": 10.068750381469727, "rewards/rejected": -12.743749618530273, "step": 11880 }, { "epoch": 3.1338956246705325, "grad_norm": 1.6692510579884703, "learning_rate": 2.1659198734844491e-07, "logits/chosen": -0.49325865507125854, "logits/rejected": -0.6932128667831421, "logps/chosen": -397.95001220703125, "logps/rejected": -523.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.4927735328674316, "rewards/margins": 10.564062118530273, "rewards/rejected": -13.065625190734863, "step": 11890 }, { "epoch": 3.1365313653136533, "grad_norm": 1.3831670107528091, "learning_rate": 2.159330521876647e-07, "logits/chosen": -0.4898742735385895, "logits/rejected": -0.773193359375, "logps/chosen": -415.3999938964844, "logps/rejected": -479.79998779296875, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1083006858825684, "rewards/margins": 10.618749618530273, "rewards/rejected": -12.737500190734863, "step": 11900 }, { "epoch": 3.1391671059567736, "grad_norm": 0.5757111889537787, "learning_rate": 2.1527411702688456e-07, "logits/chosen": -0.36030882596969604, "logits/rejected": -0.6432129144668579, "logps/chosen": -379.79998779296875, "logps/rejected": -520.5, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.6937499046325684, "rewards/margins": 10.162500381469727, "rewards/rejected": -12.856249809265137, "step": 11910 }, { "epoch": 3.1418028465998944, "grad_norm": 0.08087016555613387, "learning_rate": 2.1461518186610436e-07, "logits/chosen": -0.569653332233429, "logits/rejected": -0.806201159954071, "logps/chosen": -406.25, "logps/rejected": -478.29998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.859375, "rewards/margins": 10.293749809265137, "rewards/rejected": -13.149999618530273, "step": 11920 }, { "epoch": 3.144438587243015, "grad_norm": 0.2867568681762307, "learning_rate": 2.1395624670532418e-07, "logits/chosen": -0.4675842225551605, "logits/rejected": -0.773913562297821, "logps/chosen": -409.0, "logps/rejected": -490.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.858593702316284, "rewards/margins": 10.378125190734863, "rewards/rejected": -13.234375, "step": 11930 }, { "epoch": 3.147074327886136, "grad_norm": 2.1095963016946975, "learning_rate": 2.13297311544544e-07, "logits/chosen": -0.2834716737270355, "logits/rejected": -0.767651379108429, "logps/chosen": -391.79998779296875, "logps/rejected": -448.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0023436546325684, "rewards/margins": 10.146875381469727, "rewards/rejected": -13.159375190734863, "step": 11940 }, { "epoch": 3.1497100685292567, "grad_norm": 1.9455276671049504, "learning_rate": 2.1263837638376383e-07, "logits/chosen": -0.4635376036167145, "logits/rejected": -0.699414074420929, "logps/chosen": -397.3999938964844, "logps/rejected": -507.6000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.275830030441284, "rewards/margins": 10.65625, "rewards/rejected": -13.918749809265137, "step": 11950 }, { "epoch": 3.1523458091723775, "grad_norm": 0.2820098210881693, "learning_rate": 2.1197944122298365e-07, "logits/chosen": -0.33662718534469604, "logits/rejected": -0.7266601324081421, "logps/chosen": -393.6000061035156, "logps/rejected": -484.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.2134766578674316, "rewards/margins": 10.753125190734863, "rewards/rejected": -13.962499618530273, "step": 11960 }, { "epoch": 3.1549815498154983, "grad_norm": 1.046161540779691, "learning_rate": 2.1132050606220348e-07, "logits/chosen": -0.76708984375, "logits/rejected": -0.7635742425918579, "logps/chosen": -400.3500061035156, "logps/rejected": -522.7999877929688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.926562547683716, "rewards/margins": 10.0234375, "rewards/rejected": -13.940625190734863, "step": 11970 }, { "epoch": 3.157617290458619, "grad_norm": 0.15251991619089042, "learning_rate": 2.1066157090142327e-07, "logits/chosen": -0.517993152141571, "logits/rejected": -0.7577148675918579, "logps/chosen": -404.75, "logps/rejected": -485.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.1832032203674316, "rewards/margins": 10.850000381469727, "rewards/rejected": -14.043749809265137, "step": 11980 }, { "epoch": 3.1602530311017394, "grad_norm": 0.5006556886200765, "learning_rate": 2.1000263574064312e-07, "logits/chosen": -0.49211424589157104, "logits/rejected": -0.810839831829071, "logps/chosen": -382.70001220703125, "logps/rejected": -462.79998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.8412108421325684, "rewards/margins": 11.103124618530273, "rewards/rejected": -13.946874618530273, "step": 11990 }, { "epoch": 3.16288877174486, "grad_norm": 5.475238162308619, "learning_rate": 2.0934370057986292e-07, "logits/chosen": -0.5560058355331421, "logits/rejected": -0.902099609375, "logps/chosen": -403.29998779296875, "logps/rejected": -470.79998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.5220704078674316, "rewards/margins": 10.667187690734863, "rewards/rejected": -13.171875, "step": 12000 }, { "epoch": 3.165524512387981, "grad_norm": 1.0149552454795139, "learning_rate": 2.0868476541908275e-07, "logits/chosen": -0.522430419921875, "logits/rejected": -0.791259765625, "logps/chosen": -429.20001220703125, "logps/rejected": -487.20001220703125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.9339842796325684, "rewards/margins": 10.699999809265137, "rewards/rejected": -13.640625, "step": 12010 }, { "epoch": 3.1681602530311017, "grad_norm": 0.9253797186899417, "learning_rate": 2.080258302583026e-07, "logits/chosen": -0.3718627989292145, "logits/rejected": -0.681140124797821, "logps/chosen": -386.95001220703125, "logps/rejected": -459.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.5250000953674316, "rewards/margins": 10.646875381469727, "rewards/rejected": -13.171875, "step": 12020 }, { "epoch": 3.1707959936742225, "grad_norm": 0.3051332275486411, "learning_rate": 2.073668950975224e-07, "logits/chosen": -0.40483397245407104, "logits/rejected": -0.788818359375, "logps/chosen": -403.8500061035156, "logps/rejected": -454.29998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.525195360183716, "rewards/margins": 10.321874618530273, "rewards/rejected": -12.840624809265137, "step": 12030 }, { "epoch": 3.1734317343173433, "grad_norm": 7.03687486199785, "learning_rate": 2.0670795993674222e-07, "logits/chosen": -0.48370361328125, "logits/rejected": -0.8089355230331421, "logps/chosen": -409.6000061035156, "logps/rejected": -470.3999938964844, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -2.7457032203674316, "rewards/margins": 10.234375, "rewards/rejected": -12.987500190734863, "step": 12040 }, { "epoch": 3.176067474960464, "grad_norm": 0.8425578087409242, "learning_rate": 2.0604902477596204e-07, "logits/chosen": -0.3374267518520355, "logits/rejected": -0.783398449420929, "logps/chosen": -419.6000061035156, "logps/rejected": -491.29998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.2484374046325684, "rewards/margins": 10.340624809265137, "rewards/rejected": -13.584375381469727, "step": 12050 }, { "epoch": 3.1787032156035844, "grad_norm": 0.3026248605268046, "learning_rate": 2.0539008961518186e-07, "logits/chosen": -0.30693358182907104, "logits/rejected": -0.533123791217804, "logps/chosen": -430.54998779296875, "logps/rejected": -495.3999938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.904492139816284, "rewards/margins": 10.446874618530273, "rewards/rejected": -13.353124618530273, "step": 12060 }, { "epoch": 3.181338956246705, "grad_norm": 5.394401672234664, "learning_rate": 2.0473115445440166e-07, "logits/chosen": -0.6628448367118835, "logits/rejected": -0.808422863483429, "logps/chosen": -384.04998779296875, "logps/rejected": -469.70001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.046875, "rewards/margins": 10.53125, "rewards/rejected": -13.581250190734863, "step": 12070 }, { "epoch": 3.183974696889826, "grad_norm": 0.9528718697202488, "learning_rate": 2.040722192936215e-07, "logits/chosen": -0.599536120891571, "logits/rejected": -0.8429931402206421, "logps/chosen": -398.3999938964844, "logps/rejected": -437.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.9959959983825684, "rewards/margins": 9.998437881469727, "rewards/rejected": -12.996874809265137, "step": 12080 }, { "epoch": 3.1866104375329467, "grad_norm": 1.2984287290895753, "learning_rate": 2.034132841328413e-07, "logits/chosen": -0.4469238221645355, "logits/rejected": -0.8770996332168579, "logps/chosen": -345.0, "logps/rejected": -433.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.036328077316284, "rewards/margins": 10.0, "rewards/rejected": -13.03125, "step": 12090 }, { "epoch": 3.1892461781760675, "grad_norm": 0.6739712730871614, "learning_rate": 2.0275434897206113e-07, "logits/chosen": -0.4742797911167145, "logits/rejected": -0.698291003704071, "logps/chosen": -368.1499938964844, "logps/rejected": -481.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.4117188453674316, "rewards/margins": 10.734375, "rewards/rejected": -13.153124809265137, "step": 12100 }, { "epoch": 3.1918819188191883, "grad_norm": 3.317589831649095, "learning_rate": 2.0209541381128096e-07, "logits/chosen": -0.4218383729457855, "logits/rejected": -0.5878661870956421, "logps/chosen": -378.54998779296875, "logps/rejected": -464.6000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.863085985183716, "rewards/margins": 10.315625190734863, "rewards/rejected": -13.181249618530273, "step": 12110 }, { "epoch": 3.194517659462309, "grad_norm": 0.37992758704970203, "learning_rate": 2.0143647865050078e-07, "logits/chosen": -0.36455076932907104, "logits/rejected": -0.7374511957168579, "logps/chosen": -400.5, "logps/rejected": -480.29998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.688671827316284, "rewards/margins": 10.709375381469727, "rewards/rejected": -13.387499809265137, "step": 12120 }, { "epoch": 3.19715340010543, "grad_norm": 0.2899732866503978, "learning_rate": 2.0077754348972058e-07, "logits/chosen": -0.21990355849266052, "logits/rejected": -0.728515625, "logps/chosen": -418.0, "logps/rejected": -448.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.151171922683716, "rewards/margins": 10.484375, "rewards/rejected": -12.634374618530273, "step": 12130 }, { "epoch": 3.19978914074855, "grad_norm": 0.13344255727695195, "learning_rate": 2.0011860832894043e-07, "logits/chosen": -0.49638062715530396, "logits/rejected": -0.811816394329071, "logps/chosen": -397.45001220703125, "logps/rejected": -444.8999938964844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.0238280296325684, "rewards/margins": 10.146875381469727, "rewards/rejected": -13.171875, "step": 12140 }, { "epoch": 3.202424881391671, "grad_norm": 0.32027056591661895, "learning_rate": 1.9945967316816023e-07, "logits/chosen": -0.6035003662109375, "logits/rejected": -0.8062499761581421, "logps/chosen": -399.3999938964844, "logps/rejected": -475.6000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.566210985183716, "rewards/margins": 10.214062690734863, "rewards/rejected": -12.784375190734863, "step": 12150 }, { "epoch": 3.2050606220347917, "grad_norm": 0.1328282396900884, "learning_rate": 1.9880073800738008e-07, "logits/chosen": -0.45147705078125, "logits/rejected": -0.7159668207168579, "logps/chosen": -393.29998779296875, "logps/rejected": -466.8999938964844, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.701171875, "rewards/margins": 10.2109375, "rewards/rejected": -12.921875, "step": 12160 }, { "epoch": 3.2076963626779125, "grad_norm": 3.1831558408222103, "learning_rate": 1.981418028465999e-07, "logits/chosen": -0.5398193597793579, "logits/rejected": -0.9081054925918579, "logps/chosen": -395.5, "logps/rejected": -474.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.7816405296325684, "rewards/margins": 10.581250190734863, "rewards/rejected": -13.362500190734863, "step": 12170 }, { "epoch": 3.2103321033210332, "grad_norm": 0.5095017796526398, "learning_rate": 1.974828676858197e-07, "logits/chosen": -0.4942993223667145, "logits/rejected": -0.915820300579071, "logps/chosen": -415.79998779296875, "logps/rejected": -465.0, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.7222657203674316, "rewards/margins": 10.199999809265137, "rewards/rejected": -12.915624618530273, "step": 12180 }, { "epoch": 3.212967843964154, "grad_norm": 4.011427983999144, "learning_rate": 1.9682393252503955e-07, "logits/chosen": -0.515869140625, "logits/rejected": -0.71630859375, "logps/chosen": -407.1499938964844, "logps/rejected": -451.1000061035156, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.435742139816284, "rewards/margins": 9.959375381469727, "rewards/rejected": -12.399999618530273, "step": 12190 }, { "epoch": 3.215603584607275, "grad_norm": 0.3089517694812993, "learning_rate": 1.9616499736425934e-07, "logits/chosen": -0.4322753846645355, "logits/rejected": -0.727099597454071, "logps/chosen": -437.8999938964844, "logps/rejected": -520.0999755859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.3466796875, "rewards/margins": 10.193750381469727, "rewards/rejected": -12.546875, "step": 12200 }, { "epoch": 3.218239325250395, "grad_norm": 0.6974251581410676, "learning_rate": 1.9550606220347917e-07, "logits/chosen": -0.723828136920929, "logits/rejected": -0.831927478313446, "logps/chosen": -344.25, "logps/rejected": -440.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.997265577316284, "rewards/margins": 10.278124809265137, "rewards/rejected": -13.268750190734863, "step": 12210 }, { "epoch": 3.220875065893516, "grad_norm": 1.541681893280435, "learning_rate": 1.94847127042699e-07, "logits/chosen": -0.554302990436554, "logits/rejected": -0.6414550542831421, "logps/chosen": -424.0, "logps/rejected": -528.2000122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.4166016578674316, "rewards/margins": 10.356249809265137, "rewards/rejected": -13.784375190734863, "step": 12220 }, { "epoch": 3.2235108065366367, "grad_norm": 0.28043743724839926, "learning_rate": 1.9418819188191882e-07, "logits/chosen": -0.2950195372104645, "logits/rejected": -0.7563720941543579, "logps/chosen": -449.5, "logps/rejected": -498.79998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.287109375, "rewards/margins": 10.421875, "rewards/rejected": -13.71875, "step": 12230 }, { "epoch": 3.2261465471797575, "grad_norm": 0.2794717844038265, "learning_rate": 1.9352925672113861e-07, "logits/chosen": -0.47114259004592896, "logits/rejected": -0.736572265625, "logps/chosen": -392.3500061035156, "logps/rejected": -472.70001220703125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.285351514816284, "rewards/margins": 10.493749618530273, "rewards/rejected": -13.790624618530273, "step": 12240 }, { "epoch": 3.2287822878228782, "grad_norm": 0.0998049868132024, "learning_rate": 1.9287032156035846e-07, "logits/chosen": -0.5974884033203125, "logits/rejected": -0.7979667782783508, "logps/chosen": -381.0, "logps/rejected": -445.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.085253953933716, "rewards/margins": 10.331250190734863, "rewards/rejected": -13.418749809265137, "step": 12250 }, { "epoch": 3.231418028465999, "grad_norm": 0.249905180087387, "learning_rate": 1.9221138639957826e-07, "logits/chosen": -0.4251708984375, "logits/rejected": -0.755664050579071, "logps/chosen": -391.70001220703125, "logps/rejected": -454.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.988085985183716, "rewards/margins": 10.546875, "rewards/rejected": -13.528124809265137, "step": 12260 }, { "epoch": 3.23405376910912, "grad_norm": 4.7959667011012455, "learning_rate": 1.9155245123879808e-07, "logits/chosen": -0.42938232421875, "logits/rejected": -0.8097168207168579, "logps/chosen": -403.20001220703125, "logps/rejected": -452.29998779296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.5892577171325684, "rewards/margins": 10.396875381469727, "rewards/rejected": -12.975000381469727, "step": 12270 }, { "epoch": 3.2366895097522406, "grad_norm": 0.2641881984258528, "learning_rate": 1.908935160780179e-07, "logits/chosen": -0.5503906011581421, "logits/rejected": -0.8531738519668579, "logps/chosen": -352.75, "logps/rejected": -450.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.587109327316284, "rewards/margins": 10.421875, "rewards/rejected": -13.012499809265137, "step": 12280 }, { "epoch": 3.239325250395361, "grad_norm": 0.17410138631907124, "learning_rate": 1.9023458091723773e-07, "logits/chosen": -0.676025390625, "logits/rejected": -0.880664050579071, "logps/chosen": -402.70001220703125, "logps/rejected": -458.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.2613282203674316, "rewards/margins": 11.0625, "rewards/rejected": -14.334375381469727, "step": 12290 }, { "epoch": 3.2419609910384817, "grad_norm": 5.531118708442506, "learning_rate": 1.8957564575645758e-07, "logits/chosen": -0.560375988483429, "logits/rejected": -0.8296874761581421, "logps/chosen": -428.75, "logps/rejected": -475.8999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7126526832580566, "rewards/margins": 10.381250381469727, "rewards/rejected": -13.096875190734863, "step": 12300 }, { "epoch": 3.2445967316816025, "grad_norm": 0.2728390187682742, "learning_rate": 1.8891671059567738e-07, "logits/chosen": -0.3467468321323395, "logits/rejected": -0.657470703125, "logps/chosen": -395.04998779296875, "logps/rejected": -452.29998779296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.004687547683716, "rewards/margins": 10.878125190734863, "rewards/rejected": -13.881250381469727, "step": 12310 }, { "epoch": 3.2472324723247232, "grad_norm": 0.9785795410611283, "learning_rate": 1.882577754348972e-07, "logits/chosen": -0.4471191465854645, "logits/rejected": -0.858447253704071, "logps/chosen": -403.8999938964844, "logps/rejected": -450.70001220703125, "loss": 0.0062, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.432812452316284, "rewards/margins": 9.964062690734863, "rewards/rejected": -13.396875381469727, "step": 12320 }, { "epoch": 3.249868212967844, "grad_norm": 0.24922560003597524, "learning_rate": 1.8759884027411703e-07, "logits/chosen": -0.39473265409469604, "logits/rejected": -0.7456420660018921, "logps/chosen": -385.04998779296875, "logps/rejected": -442.70001220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.073437452316284, "rewards/margins": 10.175000190734863, "rewards/rejected": -13.231249809265137, "step": 12330 }, { "epoch": 3.252503953610965, "grad_norm": 2.368851701001132, "learning_rate": 1.8693990511333685e-07, "logits/chosen": -0.5960937738418579, "logits/rejected": -0.853710949420929, "logps/chosen": -393.1000061035156, "logps/rejected": -448.1000061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.627978563308716, "rewards/margins": 10.096875190734863, "rewards/rejected": -12.721875190734863, "step": 12340 }, { "epoch": 3.2551396942540856, "grad_norm": 0.5248751049757521, "learning_rate": 1.8628096995255665e-07, "logits/chosen": -0.21281738579273224, "logits/rejected": -0.73681640625, "logps/chosen": -376.6499938964844, "logps/rejected": -477.5, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.919921875, "rewards/margins": 10.545312881469727, "rewards/rejected": -13.462499618530273, "step": 12350 }, { "epoch": 3.257775434897206, "grad_norm": 0.15802326649891338, "learning_rate": 1.856220347917765e-07, "logits/chosen": -0.46016234159469604, "logits/rejected": -0.8590332269668579, "logps/chosen": -462.8999938964844, "logps/rejected": -488.1000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.6419920921325684, "rewards/margins": 10.498437881469727, "rewards/rejected": -13.134374618530273, "step": 12360 }, { "epoch": 3.2604111755403267, "grad_norm": 1.0222795382543068, "learning_rate": 1.849630996309963e-07, "logits/chosen": -0.4399780333042145, "logits/rejected": -0.8009277582168579, "logps/chosen": -417.3999938964844, "logps/rejected": -510.20001220703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.485546827316284, "rewards/margins": 10.321874618530273, "rewards/rejected": -13.793749809265137, "step": 12370 }, { "epoch": 3.2630469161834474, "grad_norm": 2.1347830948187037, "learning_rate": 1.8430416447021612e-07, "logits/chosen": -0.667285144329071, "logits/rejected": -0.9449218511581421, "logps/chosen": -439.6000061035156, "logps/rejected": -487.79998779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.382031202316284, "rewards/margins": 9.776562690734863, "rewards/rejected": -13.153124809265137, "step": 12380 }, { "epoch": 3.265682656826568, "grad_norm": 0.05377566584115247, "learning_rate": 1.8364522930943594e-07, "logits/chosen": -0.4920410215854645, "logits/rejected": -0.766735851764679, "logps/chosen": -390.29998779296875, "logps/rejected": -475.3999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.135937452316284, "rewards/margins": 10.90625, "rewards/rejected": -14.053125381469727, "step": 12390 }, { "epoch": 3.268318397469689, "grad_norm": 1.058639138558752, "learning_rate": 1.8298629414865577e-07, "logits/chosen": -0.529205322265625, "logits/rejected": -0.8426758050918579, "logps/chosen": -436.0, "logps/rejected": -499.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.523632764816284, "rewards/margins": 10.259374618530273, "rewards/rejected": -13.771875381469727, "step": 12400 }, { "epoch": 3.2709541381128098, "grad_norm": 0.27779855117669133, "learning_rate": 1.8232735898787556e-07, "logits/chosen": -0.4868667721748352, "logits/rejected": -0.649121105670929, "logps/chosen": -392.1000061035156, "logps/rejected": -464.20001220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.1449217796325684, "rewards/margins": 10.103124618530273, "rewards/rejected": -13.253125190734863, "step": 12410 }, { "epoch": 3.2735898787559305, "grad_norm": 0.10737159393755183, "learning_rate": 1.8166842382709541e-07, "logits/chosen": -0.49085694551467896, "logits/rejected": -0.7889648675918579, "logps/chosen": -398.29998779296875, "logps/rejected": -455.29998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9453125, "rewards/margins": 10.368749618530273, "rewards/rejected": -13.300000190734863, "step": 12420 }, { "epoch": 3.2762256193990513, "grad_norm": 1.2867140930060115, "learning_rate": 1.810094886663152e-07, "logits/chosen": -0.4483886659145355, "logits/rejected": -0.8243652582168579, "logps/chosen": -429.70001220703125, "logps/rejected": -504.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0455565452575684, "rewards/margins": 10.143750190734863, "rewards/rejected": -13.199999809265137, "step": 12430 }, { "epoch": 3.2788613600421717, "grad_norm": 0.33454824970044456, "learning_rate": 1.8035055350553504e-07, "logits/chosen": -0.67724609375, "logits/rejected": -0.9068359136581421, "logps/chosen": -392.0, "logps/rejected": -453.1000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3330078125, "rewards/margins": 10.53125, "rewards/rejected": -13.875, "step": 12440 }, { "epoch": 3.2814971006852924, "grad_norm": 1.182319636021643, "learning_rate": 1.7969161834475489e-07, "logits/chosen": -0.679901123046875, "logits/rejected": -0.9201294183731079, "logps/chosen": -362.54998779296875, "logps/rejected": -441.20001220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.0933594703674316, "rewards/margins": 10.090624809265137, "rewards/rejected": -13.184374809265137, "step": 12450 }, { "epoch": 3.284132841328413, "grad_norm": 0.49583952895903677, "learning_rate": 1.7903268318397468e-07, "logits/chosen": -0.40104979276657104, "logits/rejected": -0.7643798589706421, "logps/chosen": -364.0249938964844, "logps/rejected": -444.0, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.9349608421325684, "rewards/margins": 10.412500381469727, "rewards/rejected": -13.350000381469727, "step": 12460 }, { "epoch": 3.286768581971534, "grad_norm": 0.32727091713857764, "learning_rate": 1.783737480231945e-07, "logits/chosen": -0.46489256620407104, "logits/rejected": -0.8785156011581421, "logps/chosen": -379.42498779296875, "logps/rejected": -458.1000061035156, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6937499046325684, "rewards/margins": 10.175000190734863, "rewards/rejected": -12.878125190734863, "step": 12470 }, { "epoch": 3.2894043226146548, "grad_norm": 0.2769972377088835, "learning_rate": 1.7771481286241433e-07, "logits/chosen": -0.4413085877895355, "logits/rejected": -0.809374988079071, "logps/chosen": -402.1000061035156, "logps/rejected": -476.1000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.5960936546325684, "rewards/margins": 10.740625381469727, "rewards/rejected": -13.334375381469727, "step": 12480 }, { "epoch": 3.2920400632577755, "grad_norm": 0.11018403818400138, "learning_rate": 1.7705587770163415e-07, "logits/chosen": -0.3993896543979645, "logits/rejected": -0.6946655511856079, "logps/chosen": -384.0, "logps/rejected": -473.70001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.8783202171325684, "rewards/margins": 10.506250381469727, "rewards/rejected": -13.387499809265137, "step": 12490 }, { "epoch": 3.2946758039008963, "grad_norm": 2.8499655396009906, "learning_rate": 1.7639694254085398e-07, "logits/chosen": -0.35713499784469604, "logits/rejected": -0.842578113079071, "logps/chosen": -379.0, "logps/rejected": -497.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.064648389816284, "rewards/margins": 11.684374809265137, "rewards/rejected": -14.737500190734863, "step": 12500 }, { "epoch": 3.2973115445440166, "grad_norm": 1.1355142975055463, "learning_rate": 1.757380073800738e-07, "logits/chosen": -0.549395740032196, "logits/rejected": -0.9130859375, "logps/chosen": -378.1000061035156, "logps/rejected": -437.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.1265625953674316, "rewards/margins": 10.199999809265137, "rewards/rejected": -13.321874618530273, "step": 12510 }, { "epoch": 3.2999472851871374, "grad_norm": 4.917668613630379, "learning_rate": 1.750790722192936e-07, "logits/chosen": -0.570452868938446, "logits/rejected": -0.8832031488418579, "logps/chosen": -384.1499938964844, "logps/rejected": -490.6000061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.608203172683716, "rewards/margins": 10.34375, "rewards/rejected": -13.946874618530273, "step": 12520 }, { "epoch": 3.302583025830258, "grad_norm": 0.6938076396240372, "learning_rate": 1.7442013705851345e-07, "logits/chosen": -0.587109386920929, "logits/rejected": -0.8272460699081421, "logps/chosen": -481.8999938964844, "logps/rejected": -513.2999877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.0333008766174316, "rewards/margins": 10.600000381469727, "rewards/rejected": -13.637499809265137, "step": 12530 }, { "epoch": 3.305218766473379, "grad_norm": 0.5137879412521972, "learning_rate": 1.7376120189773325e-07, "logits/chosen": -0.5897461175918579, "logits/rejected": -0.9783691167831421, "logps/chosen": -400.25, "logps/rejected": -428.20001220703125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.0693359375, "rewards/margins": 10.146875381469727, "rewards/rejected": -13.221875190734863, "step": 12540 }, { "epoch": 3.3078545071164998, "grad_norm": 0.8735623265537625, "learning_rate": 1.7310226673695307e-07, "logits/chosen": -0.526721179485321, "logits/rejected": -0.9326171875, "logps/chosen": -425.8999938964844, "logps/rejected": -491.1000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.205273389816284, "rewards/margins": 11.0625, "rewards/rejected": -14.268750190734863, "step": 12550 }, { "epoch": 3.3104902477596205, "grad_norm": 0.1314491200600672, "learning_rate": 1.724433315761729e-07, "logits/chosen": -0.429617315530777, "logits/rejected": -0.764331042766571, "logps/chosen": -427.54998779296875, "logps/rejected": -479.6000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2835936546325684, "rewards/margins": 10.540624618530273, "rewards/rejected": -13.834375381469727, "step": 12560 }, { "epoch": 3.3131259884027413, "grad_norm": 21.129738951077645, "learning_rate": 1.7178439641539272e-07, "logits/chosen": -0.580810546875, "logits/rejected": -0.72900390625, "logps/chosen": -383.04998779296875, "logps/rejected": -443.29998779296875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.4976563453674316, "rewards/margins": 10.28125, "rewards/rejected": -13.78125, "step": 12570 }, { "epoch": 3.315761729045862, "grad_norm": 0.6668313914189601, "learning_rate": 1.7112546125461254e-07, "logits/chosen": -0.579394519329071, "logits/rejected": -0.894335925579071, "logps/chosen": -395.3500061035156, "logps/rejected": -459.79998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.1839842796325684, "rewards/margins": 10.996874809265137, "rewards/rejected": -14.181249618530273, "step": 12580 }, { "epoch": 3.3183974696889824, "grad_norm": 0.9662356797639529, "learning_rate": 1.7046652609383237e-07, "logits/chosen": -0.6278320550918579, "logits/rejected": -0.8397461175918579, "logps/chosen": -394.04998779296875, "logps/rejected": -492.0, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.5042967796325684, "rewards/margins": 10.34375, "rewards/rejected": -13.859375, "step": 12590 }, { "epoch": 3.321033210332103, "grad_norm": 0.19669125589302588, "learning_rate": 1.698075909330522e-07, "logits/chosen": -0.49378663301467896, "logits/rejected": -0.8778320550918579, "logps/chosen": -404.3999938964844, "logps/rejected": -474.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.6558594703674316, "rewards/margins": 10.631250381469727, "rewards/rejected": -14.290624618530273, "step": 12600 }, { "epoch": 3.323668950975224, "grad_norm": 2.4433415435747268, "learning_rate": 1.69148655772272e-07, "logits/chosen": -0.633544921875, "logits/rejected": -0.9325195550918579, "logps/chosen": -422.1000061035156, "logps/rejected": -452.8999938964844, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.502734422683716, "rewards/margins": 9.828125, "rewards/rejected": -13.340624809265137, "step": 12610 }, { "epoch": 3.3263046916183447, "grad_norm": 0.2303675684324055, "learning_rate": 1.6848972061149184e-07, "logits/chosen": -0.5102294683456421, "logits/rejected": -0.875292956829071, "logps/chosen": -437.3999938964844, "logps/rejected": -478.79998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.1333985328674316, "rewards/margins": 10.506250381469727, "rewards/rejected": -13.643750190734863, "step": 12620 }, { "epoch": 3.3289404322614655, "grad_norm": 0.4412556261278976, "learning_rate": 1.6783078545071163e-07, "logits/chosen": -0.321533203125, "logits/rejected": -0.8543456792831421, "logps/chosen": -379.3500061035156, "logps/rejected": -415.20001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.7105712890625, "rewards/margins": 10.220312118530273, "rewards/rejected": -12.925000190734863, "step": 12630 }, { "epoch": 3.3315761729045863, "grad_norm": 0.5549962687704213, "learning_rate": 1.6717185028993146e-07, "logits/chosen": -0.4609130918979645, "logits/rejected": -0.7533324956893921, "logps/chosen": -408.3500061035156, "logps/rejected": -460.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.478515625, "rewards/margins": 10.151562690734863, "rewards/rejected": -12.625, "step": 12640 }, { "epoch": 3.334211913547707, "grad_norm": 0.4808298044589174, "learning_rate": 1.6651291512915128e-07, "logits/chosen": -0.480978399515152, "logits/rejected": -0.839404284954071, "logps/chosen": -411.6000061035156, "logps/rejected": -491.79998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.901562452316284, "rewards/margins": 10.521875381469727, "rewards/rejected": -13.425000190734863, "step": 12650 }, { "epoch": 3.3368476541908274, "grad_norm": 29.681787688721535, "learning_rate": 1.658539799683711e-07, "logits/chosen": -0.46275633573532104, "logits/rejected": -0.912915050983429, "logps/chosen": -390.8500061035156, "logps/rejected": -494.0, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.750781297683716, "rewards/margins": 10.774999618530273, "rewards/rejected": -13.521875381469727, "step": 12660 }, { "epoch": 3.339483394833948, "grad_norm": 0.9089973807263494, "learning_rate": 1.6519504480759093e-07, "logits/chosen": -0.5285278558731079, "logits/rejected": -0.8499511480331421, "logps/chosen": -409.5, "logps/rejected": -488.70001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.047656297683716, "rewards/margins": 10.321874618530273, "rewards/rejected": -13.378125190734863, "step": 12670 }, { "epoch": 3.342119135477069, "grad_norm": 1.1546274770062077, "learning_rate": 1.6453610964681075e-07, "logits/chosen": -0.6407226324081421, "logits/rejected": -0.802050769329071, "logps/chosen": -432.75, "logps/rejected": -533.2999877929688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.536328077316284, "rewards/margins": 10.853124618530273, "rewards/rejected": -14.390625, "step": 12680 }, { "epoch": 3.3447548761201897, "grad_norm": 0.6361419980416559, "learning_rate": 1.6387717448603055e-07, "logits/chosen": -0.5902465581893921, "logits/rejected": -0.862500011920929, "logps/chosen": -432.1000061035156, "logps/rejected": -501.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0972657203674316, "rewards/margins": 10.734375, "rewards/rejected": -13.828125, "step": 12690 }, { "epoch": 3.3473906167633105, "grad_norm": 10.63922210173501, "learning_rate": 1.632182393252504e-07, "logits/chosen": -0.6990112066268921, "logits/rejected": -0.986035168170929, "logps/chosen": -412.70001220703125, "logps/rejected": -501.0, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.4273438453674316, "rewards/margins": 10.387499809265137, "rewards/rejected": -13.818750381469727, "step": 12700 }, { "epoch": 3.3500263574064313, "grad_norm": 0.1899338709793997, "learning_rate": 1.625593041644702e-07, "logits/chosen": -0.3824523985385895, "logits/rejected": -0.971630871295929, "logps/chosen": -438.29998779296875, "logps/rejected": -469.6000061035156, "loss": 0.0097, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.38623046875, "rewards/margins": 10.796875, "rewards/rejected": -14.181249618530273, "step": 12710 }, { "epoch": 3.352662098049552, "grad_norm": 1.7331139232480182, "learning_rate": 1.6190036900369002e-07, "logits/chosen": -0.4265502989292145, "logits/rejected": -1.007226586341858, "logps/chosen": -439.5, "logps/rejected": -478.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.2035155296325684, "rewards/margins": 10.503125190734863, "rewards/rejected": -13.690625190734863, "step": 12720 }, { "epoch": 3.355297838692673, "grad_norm": 0.8569591502611655, "learning_rate": 1.6124143384290987e-07, "logits/chosen": -0.5400635004043579, "logits/rejected": -0.912353515625, "logps/chosen": -459.8999938964844, "logps/rejected": -510.20001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.6732420921325684, "rewards/margins": 10.806249618530273, "rewards/rejected": -13.478124618530273, "step": 12730 }, { "epoch": 3.357933579335793, "grad_norm": 0.06195331049505722, "learning_rate": 1.6058249868212967e-07, "logits/chosen": -0.723583996295929, "logits/rejected": -0.8448241949081421, "logps/chosen": -410.79998779296875, "logps/rejected": -506.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.0634765625, "rewards/margins": 10.643750190734863, "rewards/rejected": -13.709375381469727, "step": 12740 }, { "epoch": 3.360569319978914, "grad_norm": 0.447512190248849, "learning_rate": 1.599235635213495e-07, "logits/chosen": -0.6531982421875, "logits/rejected": -0.850354015827179, "logps/chosen": -368.45001220703125, "logps/rejected": -480.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.2035155296325684, "rewards/margins": 10.578125, "rewards/rejected": -13.778124809265137, "step": 12750 }, { "epoch": 3.3632050606220347, "grad_norm": 0.13793872273347663, "learning_rate": 1.5926462836056932e-07, "logits/chosen": -0.5091797113418579, "logits/rejected": -0.906994640827179, "logps/chosen": -426.6000061035156, "logps/rejected": -420.5, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.7535157203674316, "rewards/margins": 10.295312881469727, "rewards/rejected": -13.050000190734863, "step": 12760 }, { "epoch": 3.3658408012651555, "grad_norm": 0.4131556101588039, "learning_rate": 1.5860569319978914e-07, "logits/chosen": -0.39808350801467896, "logits/rejected": -0.863037109375, "logps/chosen": -383.20001220703125, "logps/rejected": -482.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7972655296325684, "rewards/margins": 9.981249809265137, "rewards/rejected": -12.778124809265137, "step": 12770 }, { "epoch": 3.3684765419082763, "grad_norm": 1.4902104805416956, "learning_rate": 1.5794675803900894e-07, "logits/chosen": -0.536120593547821, "logits/rejected": -0.813671886920929, "logps/chosen": -345.70001220703125, "logps/rejected": -450.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.963671922683716, "rewards/margins": 9.715624809265137, "rewards/rejected": -12.684374809265137, "step": 12780 }, { "epoch": 3.371112282551397, "grad_norm": 1.2438930588462838, "learning_rate": 1.572878228782288e-07, "logits/chosen": -0.32568359375, "logits/rejected": -0.7916198968887329, "logps/chosen": -391.75, "logps/rejected": -458.5, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.4535155296325684, "rewards/margins": 10.462499618530273, "rewards/rejected": -12.912500381469727, "step": 12790 }, { "epoch": 3.373748023194518, "grad_norm": 0.4421013137886415, "learning_rate": 1.5662888771744859e-07, "logits/chosen": -0.4959716796875, "logits/rejected": -0.801953136920929, "logps/chosen": -406.6000061035156, "logps/rejected": -471.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.797656297683716, "rewards/margins": 10.712499618530273, "rewards/rejected": -13.515625, "step": 12800 }, { "epoch": 3.376383763837638, "grad_norm": 0.15400517477495532, "learning_rate": 1.559699525566684e-07, "logits/chosen": -0.6923156976699829, "logits/rejected": -0.801220715045929, "logps/chosen": -417.3999938964844, "logps/rejected": -484.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.711718797683716, "rewards/margins": 10.001562118530273, "rewards/rejected": -12.71875, "step": 12810 }, { "epoch": 3.379019504480759, "grad_norm": 0.21684913921751436, "learning_rate": 1.5531101739588823e-07, "logits/chosen": -0.560076892375946, "logits/rejected": -0.83984375, "logps/chosen": -433.95001220703125, "logps/rejected": -474.3999938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.001171827316284, "rewards/margins": 10.110937118530273, "rewards/rejected": -13.106249809265137, "step": 12820 }, { "epoch": 3.3816552451238797, "grad_norm": 0.2921677100783785, "learning_rate": 1.5465208223510806e-07, "logits/chosen": -0.5479491949081421, "logits/rejected": -0.846386730670929, "logps/chosen": -363.29998779296875, "logps/rejected": -491.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.2164063453674316, "rewards/margins": 10.993749618530273, "rewards/rejected": -14.212499618530273, "step": 12830 }, { "epoch": 3.3842909857670005, "grad_norm": 0.5402143173162686, "learning_rate": 1.5399314707432785e-07, "logits/chosen": -0.49578857421875, "logits/rejected": -0.8260742425918579, "logps/chosen": -385.1499938964844, "logps/rejected": -454.3999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.5132813453674316, "rewards/margins": 10.431249618530273, "rewards/rejected": -12.949999809265137, "step": 12840 }, { "epoch": 3.3869267264101213, "grad_norm": 1.0629638367106402, "learning_rate": 1.533342119135477e-07, "logits/chosen": -0.4474731385707855, "logits/rejected": -0.822314441204071, "logps/chosen": -413.8999938964844, "logps/rejected": -469.8999938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.277148485183716, "rewards/margins": 10.1484375, "rewards/rejected": -13.443750381469727, "step": 12850 }, { "epoch": 3.389562467053242, "grad_norm": 0.513500374161518, "learning_rate": 1.5267527675276753e-07, "logits/chosen": -0.7314208745956421, "logits/rejected": -0.849609375, "logps/chosen": -395.8500061035156, "logps/rejected": -487.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.9730467796325684, "rewards/margins": 10.715624809265137, "rewards/rejected": -13.690625190734863, "step": 12860 }, { "epoch": 3.392198207696363, "grad_norm": 0.7379215794137434, "learning_rate": 1.5201634159198735e-07, "logits/chosen": -0.5418701171875, "logits/rejected": -0.9275878667831421, "logps/chosen": -430.70001220703125, "logps/rejected": -465.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4671874046325684, "rewards/margins": 10.774999618530273, "rewards/rejected": -14.246874809265137, "step": 12870 }, { "epoch": 3.3948339483394836, "grad_norm": 0.5169033697835456, "learning_rate": 1.5135740643120718e-07, "logits/chosen": -0.468017578125, "logits/rejected": -0.86572265625, "logps/chosen": -373.0, "logps/rejected": -457.8999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.1480469703674316, "rewards/margins": 10.478124618530273, "rewards/rejected": -13.631250381469727, "step": 12880 }, { "epoch": 3.397469688982604, "grad_norm": 0.20571421496388895, "learning_rate": 1.5069847127042697e-07, "logits/chosen": -0.589160144329071, "logits/rejected": -0.925000011920929, "logps/chosen": -419.3999938964844, "logps/rejected": -503.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.15625, "rewards/margins": 11.006250381469727, "rewards/rejected": -14.15625, "step": 12890 }, { "epoch": 3.4001054296257247, "grad_norm": 0.5131521504344106, "learning_rate": 1.5003953610964682e-07, "logits/chosen": -0.623974621295929, "logits/rejected": -0.877636730670929, "logps/chosen": -399.04998779296875, "logps/rejected": -487.70001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.74609375, "rewards/margins": 10.2578125, "rewards/rejected": -14.003125190734863, "step": 12900 }, { "epoch": 3.4027411702688455, "grad_norm": 0.8253555042830288, "learning_rate": 1.4938060094886662e-07, "logits/chosen": -0.4708251953125, "logits/rejected": -0.8539062738418579, "logps/chosen": -429.25, "logps/rejected": -475.70001220703125, "loss": 0.0412, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5023436546325684, "rewards/margins": 10.532812118530273, "rewards/rejected": -14.037500381469727, "step": 12910 }, { "epoch": 3.4053769109119663, "grad_norm": 3.3444657859400717, "learning_rate": 1.4872166578808644e-07, "logits/chosen": -0.5986328125, "logits/rejected": -0.819323718547821, "logps/chosen": -392.25, "logps/rejected": -445.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.989062547683716, "rewards/margins": 10.346875190734863, "rewards/rejected": -13.350000381469727, "step": 12920 }, { "epoch": 3.408012651555087, "grad_norm": 0.2336771696125587, "learning_rate": 1.4806273062730627e-07, "logits/chosen": -0.65625, "logits/rejected": -0.8541015386581421, "logps/chosen": -455.45001220703125, "logps/rejected": -501.6000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.0894532203674316, "rewards/margins": 10.657812118530273, "rewards/rejected": -13.743749618530273, "step": 12930 }, { "epoch": 3.410648392198208, "grad_norm": 1.4004634546059855, "learning_rate": 1.474037954665261e-07, "logits/chosen": -0.6142822504043579, "logits/rejected": -0.9883788824081421, "logps/chosen": -367.20001220703125, "logps/rejected": -410.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3026671409606934, "rewards/margins": 10.446874618530273, "rewards/rejected": -13.762499809265137, "step": 12940 }, { "epoch": 3.4132841328413286, "grad_norm": 1.0694101798850015, "learning_rate": 1.467448603057459e-07, "logits/chosen": -0.512524425983429, "logits/rejected": -0.715624988079071, "logps/chosen": -450.6000061035156, "logps/rejected": -504.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.783203125, "rewards/margins": 10.995312690734863, "rewards/rejected": -13.774999618530273, "step": 12950 }, { "epoch": 3.415919873484449, "grad_norm": 1.1237219710746174, "learning_rate": 1.4608592514496574e-07, "logits/chosen": -0.6103271245956421, "logits/rejected": -0.924609363079071, "logps/chosen": -377.5, "logps/rejected": -420.3999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.9007811546325684, "rewards/margins": 10.1875, "rewards/rejected": -13.081250190734863, "step": 12960 }, { "epoch": 3.4185556141275697, "grad_norm": 0.11460834573557896, "learning_rate": 1.4542698998418554e-07, "logits/chosen": -0.5529540777206421, "logits/rejected": -0.8494628667831421, "logps/chosen": -420.75, "logps/rejected": -495.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.015625, "rewards/margins": 10.649999618530273, "rewards/rejected": -13.659375190734863, "step": 12970 }, { "epoch": 3.4211913547706905, "grad_norm": 1.679586947651783, "learning_rate": 1.4476805482340536e-07, "logits/chosen": -0.6217285394668579, "logits/rejected": -0.7572265863418579, "logps/chosen": -406.20001220703125, "logps/rejected": -476.79998779296875, "loss": 0.0104, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.660351514816284, "rewards/margins": 10.089062690734863, "rewards/rejected": -12.746874809265137, "step": 12980 }, { "epoch": 3.4238270954138113, "grad_norm": 0.3703054218142828, "learning_rate": 1.4410911966262518e-07, "logits/chosen": -0.603759765625, "logits/rejected": -0.8922119140625, "logps/chosen": -412.5, "logps/rejected": -491.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.223437547683716, "rewards/margins": 10.643750190734863, "rewards/rejected": -13.871874809265137, "step": 12990 }, { "epoch": 3.426462836056932, "grad_norm": 1.8090660497666682, "learning_rate": 1.43450184501845e-07, "logits/chosen": -0.600634753704071, "logits/rejected": -0.899218738079071, "logps/chosen": -358.20001220703125, "logps/rejected": -458.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.1060547828674316, "rewards/margins": 10.703125, "rewards/rejected": -13.8125, "step": 13000 }, { "epoch": 3.429098576700053, "grad_norm": 0.762982571793326, "learning_rate": 1.4279124934106486e-07, "logits/chosen": -0.795947253704071, "logits/rejected": -0.812548816204071, "logps/chosen": -382.8999938964844, "logps/rejected": -484.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.1929688453674316, "rewards/margins": 10.800000190734863, "rewards/rejected": -14.0, "step": 13010 }, { "epoch": 3.4317343173431736, "grad_norm": 5.287916116804272, "learning_rate": 1.4213231418028466e-07, "logits/chosen": -0.708447277545929, "logits/rejected": -0.913684070110321, "logps/chosen": -381.79998779296875, "logps/rejected": -449.70001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.9761719703674316, "rewards/margins": 10.353124618530273, "rewards/rejected": -13.331250190734863, "step": 13020 }, { "epoch": 3.4343700579862944, "grad_norm": 0.5929746119839656, "learning_rate": 1.4147337901950448e-07, "logits/chosen": -0.44265443086624146, "logits/rejected": -0.795654296875, "logps/chosen": -382.3999938964844, "logps/rejected": -469.5, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -3.3218750953674316, "rewards/margins": 10.403124809265137, "rewards/rejected": -13.734375, "step": 13030 }, { "epoch": 3.4370057986294147, "grad_norm": 0.8054274674381761, "learning_rate": 1.408144438587243e-07, "logits/chosen": -0.487985223531723, "logits/rejected": -0.8128906488418579, "logps/chosen": -386.6000061035156, "logps/rejected": -459.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.853515625, "rewards/margins": 10.793749809265137, "rewards/rejected": -13.640625, "step": 13040 }, { "epoch": 3.4396415392725355, "grad_norm": 0.35776368635884875, "learning_rate": 1.4015550869794413e-07, "logits/chosen": -0.569628894329071, "logits/rejected": -0.822314441204071, "logps/chosen": -384.8999938964844, "logps/rejected": -480.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.871875047683716, "rewards/margins": 10.543749809265137, "rewards/rejected": -13.421875, "step": 13050 }, { "epoch": 3.4422772799156562, "grad_norm": 0.07164629365043584, "learning_rate": 1.3949657353716392e-07, "logits/chosen": -0.584545910358429, "logits/rejected": -0.834179699420929, "logps/chosen": -377.70001220703125, "logps/rejected": -471.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.8929686546325684, "rewards/margins": 10.690625190734863, "rewards/rejected": -13.581250190734863, "step": 13060 }, { "epoch": 3.444913020558777, "grad_norm": 0.906313871767243, "learning_rate": 1.3883763837638377e-07, "logits/chosen": -0.6031494140625, "logits/rejected": -0.87158203125, "logps/chosen": -383.8500061035156, "logps/rejected": -449.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.787109375, "rewards/margins": 10.387499809265137, "rewards/rejected": -13.165624618530273, "step": 13070 }, { "epoch": 3.447548761201898, "grad_norm": 0.23832854963689865, "learning_rate": 1.3817870321560357e-07, "logits/chosen": -0.45604246854782104, "logits/rejected": -0.8267577886581421, "logps/chosen": -445.79998779296875, "logps/rejected": -488.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.8255858421325684, "rewards/margins": 10.415624618530273, "rewards/rejected": -13.243749618530273, "step": 13080 }, { "epoch": 3.4501845018450186, "grad_norm": 17.597734029709503, "learning_rate": 1.375197680548234e-07, "logits/chosen": -0.5511474609375, "logits/rejected": -0.787353515625, "logps/chosen": -366.75, "logps/rejected": -464.95001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.758007764816284, "rewards/margins": 10.646875381469727, "rewards/rejected": -13.393750190734863, "step": 13090 }, { "epoch": 3.4528202424881393, "grad_norm": 0.17827418502807835, "learning_rate": 1.3686083289404322e-07, "logits/chosen": -0.5401611328125, "logits/rejected": -0.9559081792831421, "logps/chosen": -396.1000061035156, "logps/rejected": -483.6000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.491406202316284, "rewards/margins": 10.965624809265137, "rewards/rejected": -14.46875, "step": 13100 }, { "epoch": 3.4554559831312597, "grad_norm": 0.687970639746153, "learning_rate": 1.3620189773326304e-07, "logits/chosen": -0.682037353515625, "logits/rejected": -0.900561511516571, "logps/chosen": -366.5, "logps/rejected": -496.20001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.1292967796325684, "rewards/margins": 11.043749809265137, "rewards/rejected": -14.178125381469727, "step": 13110 }, { "epoch": 3.4580917237743805, "grad_norm": 0.3294207916810677, "learning_rate": 1.3554296257248284e-07, "logits/chosen": -0.6122192144393921, "logits/rejected": -0.779492199420929, "logps/chosen": -395.75, "logps/rejected": -478.3999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7978515625, "rewards/margins": 10.596875190734863, "rewards/rejected": -13.399999618530273, "step": 13120 }, { "epoch": 3.4607274644175012, "grad_norm": 1.0393322013069632, "learning_rate": 1.348840274117027e-07, "logits/chosen": -0.47794800996780396, "logits/rejected": -0.806347668170929, "logps/chosen": -393.25, "logps/rejected": -490.20001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.9039063453674316, "rewards/margins": 10.265625, "rewards/rejected": -13.171875, "step": 13130 }, { "epoch": 3.463363205060622, "grad_norm": 0.5337177208085736, "learning_rate": 1.3422509225092251e-07, "logits/chosen": -0.4864563047885895, "logits/rejected": -0.904248058795929, "logps/chosen": -421.20001220703125, "logps/rejected": -473.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.942578077316284, "rewards/margins": 10.303125381469727, "rewards/rejected": -13.256250381469727, "step": 13140 }, { "epoch": 3.465998945703743, "grad_norm": 0.09255886078194767, "learning_rate": 1.335661570901423e-07, "logits/chosen": -0.526293933391571, "logits/rejected": -0.789306640625, "logps/chosen": -368.6499938964844, "logps/rejected": -456.1000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.762500047683716, "rewards/margins": 10.884374618530273, "rewards/rejected": -13.637499809265137, "step": 13150 }, { "epoch": 3.4686346863468636, "grad_norm": 0.32993688728995446, "learning_rate": 1.3290722192936216e-07, "logits/chosen": -0.5584472417831421, "logits/rejected": -0.911816418170929, "logps/chosen": -418.6499938964844, "logps/rejected": -457.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.43408203125, "rewards/margins": 10.087499618530273, "rewards/rejected": -12.512499809265137, "step": 13160 }, { "epoch": 3.4712704269899843, "grad_norm": 5.7609182991668595, "learning_rate": 1.3224828676858196e-07, "logits/chosen": -0.659912109375, "logits/rejected": -1.072119116783142, "logps/chosen": -420.54998779296875, "logps/rejected": -451.0, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.837890625, "rewards/margins": 10.178125381469727, "rewards/rejected": -13.021875381469727, "step": 13170 }, { "epoch": 3.473906167633105, "grad_norm": 1.1497431607881996, "learning_rate": 1.3158935160780178e-07, "logits/chosen": -0.5016418695449829, "logits/rejected": -0.796276867389679, "logps/chosen": -400.70001220703125, "logps/rejected": -495.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.735156297683716, "rewards/margins": 10.490625381469727, "rewards/rejected": -13.21875, "step": 13180 }, { "epoch": 3.4765419082762254, "grad_norm": 0.5068091249659412, "learning_rate": 1.309304164470216e-07, "logits/chosen": -0.602832019329071, "logits/rejected": -0.861950695514679, "logps/chosen": -432.5, "logps/rejected": -499.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4347167015075684, "rewards/margins": 10.449999809265137, "rewards/rejected": -12.875, "step": 13190 }, { "epoch": 3.479177648919346, "grad_norm": 3.457312736077941, "learning_rate": 1.3027148128624143e-07, "logits/chosen": -0.5496581792831421, "logits/rejected": -0.8237549066543579, "logps/chosen": -427.75, "logps/rejected": -484.29998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0833983421325684, "rewards/margins": 10.595312118530273, "rewards/rejected": -13.6875, "step": 13200 }, { "epoch": 3.481813389562467, "grad_norm": 0.6587746059729992, "learning_rate": 1.2961254612546125e-07, "logits/chosen": -0.6817871332168579, "logits/rejected": -0.934374988079071, "logps/chosen": -374.1499938964844, "logps/rejected": -448.5, "loss": 0.0052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8355469703674316, "rewards/margins": 10.112500190734863, "rewards/rejected": -12.940625190734863, "step": 13210 }, { "epoch": 3.4844491302055878, "grad_norm": 2.2961294621424697, "learning_rate": 1.2895361096468108e-07, "logits/chosen": -0.3848938047885895, "logits/rejected": -0.7431396245956421, "logps/chosen": -356.1499938964844, "logps/rejected": -465.75, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.4735350608825684, "rewards/margins": 10.6875, "rewards/rejected": -13.171875, "step": 13220 }, { "epoch": 3.4870848708487086, "grad_norm": 1.8635688181523322, "learning_rate": 1.2829467580390088e-07, "logits/chosen": -0.353515625, "logits/rejected": -0.846728503704071, "logps/chosen": -373.1000061035156, "logps/rejected": -444.95001220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.468945264816284, "rewards/margins": 10.365625381469727, "rewards/rejected": -12.840624809265137, "step": 13230 }, { "epoch": 3.4897206114918293, "grad_norm": 2.7438146767382934, "learning_rate": 1.2763574064312073e-07, "logits/chosen": -0.6542999148368835, "logits/rejected": -0.9110351800918579, "logps/chosen": -399.8999938964844, "logps/rejected": -496.79998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.634765625, "rewards/margins": 10.649999618530273, "rewards/rejected": -13.284375190734863, "step": 13240 }, { "epoch": 3.49235635213495, "grad_norm": 0.3528387492793459, "learning_rate": 1.2697680548234052e-07, "logits/chosen": -0.47114259004592896, "logits/rejected": -0.98046875, "logps/chosen": -439.8999938964844, "logps/rejected": -473.8999938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.509765625, "rewards/margins": 10.228124618530273, "rewards/rejected": -13.737500190734863, "step": 13250 }, { "epoch": 3.4949920927780704, "grad_norm": 0.6055159361703092, "learning_rate": 1.2631787032156035e-07, "logits/chosen": -0.5061279535293579, "logits/rejected": -0.8128417730331421, "logps/chosen": -383.54998779296875, "logps/rejected": -456.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.28125, "rewards/margins": 10.375, "rewards/rejected": -13.643750190734863, "step": 13260 }, { "epoch": 3.497627833421191, "grad_norm": 1.939095481591044, "learning_rate": 1.2565893516078017e-07, "logits/chosen": -0.4294067323207855, "logits/rejected": -0.656005859375, "logps/chosen": -418.20001220703125, "logps/rejected": -535.0, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.213671922683716, "rewards/margins": 10.521875381469727, "rewards/rejected": -13.728124618530273, "step": 13270 }, { "epoch": 3.500263574064312, "grad_norm": 0.8704909990257229, "learning_rate": 1.25e-07, "logits/chosen": -0.567578136920929, "logits/rejected": -1.0237305164337158, "logps/chosen": -403.0, "logps/rejected": -457.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0953125953674316, "rewards/margins": 10.490625381469727, "rewards/rejected": -13.59375, "step": 13280 }, { "epoch": 3.5028993147074328, "grad_norm": 3.192477349085736, "learning_rate": 1.2434106483921982e-07, "logits/chosen": -0.721972644329071, "logits/rejected": -0.940234363079071, "logps/chosen": -337.45001220703125, "logps/rejected": -421.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.50390625, "rewards/margins": 10.209375381469727, "rewards/rejected": -12.706250190734863, "step": 13290 }, { "epoch": 3.5055350553505535, "grad_norm": 0.12481529676077001, "learning_rate": 1.2368212967843964e-07, "logits/chosen": -0.4202636778354645, "logits/rejected": -0.83837890625, "logps/chosen": -366.3999938964844, "logps/rejected": -487.6000061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.98828125, "rewards/margins": 10.6875, "rewards/rejected": -13.662500381469727, "step": 13300 }, { "epoch": 3.5081707959936743, "grad_norm": 0.9232656186924437, "learning_rate": 1.2302319451765947e-07, "logits/chosen": -0.6420532464981079, "logits/rejected": -0.934277355670929, "logps/chosen": -366.75, "logps/rejected": -442.5, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.180468797683716, "rewards/margins": 10.431249618530273, "rewards/rejected": -13.606249809265137, "step": 13310 }, { "epoch": 3.510806536636795, "grad_norm": 0.8072680971668093, "learning_rate": 1.2236425935687926e-07, "logits/chosen": -0.707958996295929, "logits/rejected": -0.9651733636856079, "logps/chosen": -400.3500061035156, "logps/rejected": -466.79998779296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.0896239280700684, "rewards/margins": 10.6875, "rewards/rejected": -13.774999618530273, "step": 13320 }, { "epoch": 3.513442277279916, "grad_norm": 0.02776979039095396, "learning_rate": 1.2170532419609909e-07, "logits/chosen": -0.6776367425918579, "logits/rejected": -0.917773425579071, "logps/chosen": -408.70001220703125, "logps/rejected": -490.3999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2867188453674316, "rewards/margins": 10.806249618530273, "rewards/rejected": -14.087499618530273, "step": 13330 }, { "epoch": 3.5160780179230366, "grad_norm": 1.0252004010065667, "learning_rate": 1.210463890353189e-07, "logits/chosen": -0.4881652891635895, "logits/rejected": -0.8975585699081421, "logps/chosen": -423.0, "logps/rejected": -497.20001220703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.9615235328674316, "rewards/margins": 11.065625190734863, "rewards/rejected": -14.024999618530273, "step": 13340 }, { "epoch": 3.518713758566157, "grad_norm": 0.535862115783026, "learning_rate": 1.2038745387453873e-07, "logits/chosen": -0.5518798828125, "logits/rejected": -0.833935558795929, "logps/chosen": -389.25, "logps/rejected": -440.1499938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0355467796325684, "rewards/margins": 10.481249809265137, "rewards/rejected": -13.521875381469727, "step": 13350 }, { "epoch": 3.5213494992092778, "grad_norm": 4.216290790415851, "learning_rate": 1.1972851871375856e-07, "logits/chosen": -0.5813964605331421, "logits/rejected": -0.853686511516571, "logps/chosen": -362.8999938964844, "logps/rejected": -451.54998779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.163769483566284, "rewards/margins": 10.778124809265137, "rewards/rejected": -13.946874618530273, "step": 13360 }, { "epoch": 3.5239852398523985, "grad_norm": 1.927013574937313, "learning_rate": 1.1906958355297838e-07, "logits/chosen": -0.5678421258926392, "logits/rejected": -0.866650402545929, "logps/chosen": -382.6499938964844, "logps/rejected": -455.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.2906250953674316, "rewards/margins": 10.192187309265137, "rewards/rejected": -13.490625381469727, "step": 13370 }, { "epoch": 3.5266209804955193, "grad_norm": 11.989601058989798, "learning_rate": 1.184106483921982e-07, "logits/chosen": -0.5865478515625, "logits/rejected": -0.7906738519668579, "logps/chosen": -377.3999938964844, "logps/rejected": -500.8999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.724609375, "rewards/margins": 10.096875190734863, "rewards/rejected": -12.821874618530273, "step": 13380 }, { "epoch": 3.52925672113864, "grad_norm": 0.6279204094721199, "learning_rate": 1.1775171323141803e-07, "logits/chosen": -0.41535645723342896, "logits/rejected": -0.74755859375, "logps/chosen": -398.0, "logps/rejected": -484.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.890625, "rewards/margins": 10.53125, "rewards/rejected": -13.415624618530273, "step": 13390 }, { "epoch": 3.5318924617817604, "grad_norm": 0.42586487467860057, "learning_rate": 1.1709277807063784e-07, "logits/chosen": -0.594329833984375, "logits/rejected": -0.6827392578125, "logps/chosen": -362.75, "logps/rejected": -456.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1304688453674316, "rewards/margins": 10.465624809265137, "rewards/rejected": -13.587499618530273, "step": 13400 }, { "epoch": 3.534528202424881, "grad_norm": 0.23830245676476033, "learning_rate": 1.1643384290985766e-07, "logits/chosen": -0.3566940426826477, "logits/rejected": -0.7708984613418579, "logps/chosen": -416.8999938964844, "logps/rejected": -483.8999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.259570360183716, "rewards/margins": 10.596875190734863, "rewards/rejected": -13.846875190734863, "step": 13410 }, { "epoch": 3.537163943068002, "grad_norm": 0.6185824141617025, "learning_rate": 1.1577490774907749e-07, "logits/chosen": -0.5668090581893921, "logits/rejected": -0.899462878704071, "logps/chosen": -393.45001220703125, "logps/rejected": -521.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.709765672683716, "rewards/margins": 10.918749809265137, "rewards/rejected": -14.637499809265137, "step": 13420 }, { "epoch": 3.5397996837111227, "grad_norm": 0.15384574589530015, "learning_rate": 1.1511597258829731e-07, "logits/chosen": -0.607617199420929, "logits/rejected": -0.8857421875, "logps/chosen": -404.04998779296875, "logps/rejected": -466.3999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.802539110183716, "rewards/margins": 10.368749618530273, "rewards/rejected": -13.168749809265137, "step": 13430 }, { "epoch": 3.5424354243542435, "grad_norm": 0.2483013005396663, "learning_rate": 1.1445703742751712e-07, "logits/chosen": -0.420166015625, "logits/rejected": -0.8043273687362671, "logps/chosen": -387.5, "logps/rejected": -469.3999938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.8589844703674316, "rewards/margins": 10.201562881469727, "rewards/rejected": -13.059374809265137, "step": 13440 }, { "epoch": 3.5450711649973643, "grad_norm": 0.09278155564273714, "learning_rate": 1.1379810226673695e-07, "logits/chosen": -0.34040528535842896, "logits/rejected": -0.8324218988418579, "logps/chosen": -467.70001220703125, "logps/rejected": -503.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.054760694503784, "rewards/margins": 10.354687690734863, "rewards/rejected": -13.412500381469727, "step": 13450 }, { "epoch": 3.547706905640485, "grad_norm": 4.20595734393006, "learning_rate": 1.1313916710595677e-07, "logits/chosen": -0.6411193609237671, "logits/rejected": -0.864013671875, "logps/chosen": -375.8999938964844, "logps/rejected": -494.29998779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.4046874046325684, "rewards/margins": 10.659375190734863, "rewards/rejected": -14.065625190734863, "step": 13460 }, { "epoch": 3.550342646283606, "grad_norm": 0.363357355779573, "learning_rate": 1.1248023194517658e-07, "logits/chosen": -0.594555675983429, "logits/rejected": -0.796630859375, "logps/chosen": -407.29998779296875, "logps/rejected": -448.20001220703125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.251953125, "rewards/margins": 10.953125, "rewards/rejected": -14.203125, "step": 13470 }, { "epoch": 3.5529783869267266, "grad_norm": 0.16145375263878065, "learning_rate": 1.118212967843964e-07, "logits/chosen": -0.36572265625, "logits/rejected": -0.683398425579071, "logps/chosen": -408.6499938964844, "logps/rejected": -481.6000061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.870800733566284, "rewards/margins": 10.428125381469727, "rewards/rejected": -13.296875, "step": 13480 }, { "epoch": 3.555614127569847, "grad_norm": 1.165247810648095, "learning_rate": 1.1116236162361623e-07, "logits/chosen": -0.4876464903354645, "logits/rejected": -0.897290050983429, "logps/chosen": -405.70001220703125, "logps/rejected": -494.8999938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.8179688453674316, "rewards/margins": 10.65625, "rewards/rejected": -14.475000381469727, "step": 13490 }, { "epoch": 3.5582498682129677, "grad_norm": 0.25222748209060886, "learning_rate": 1.1050342646283606e-07, "logits/chosen": -0.8238769769668579, "logits/rejected": -0.9227050542831421, "logps/chosen": -441.79998779296875, "logps/rejected": -490.5, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.2798829078674316, "rewards/margins": 10.824999809265137, "rewards/rejected": -14.103124618530273, "step": 13500 }, { "epoch": 3.5608856088560885, "grad_norm": 0.17404653818134777, "learning_rate": 1.0984449130205587e-07, "logits/chosen": -0.3455444276332855, "logits/rejected": -0.798583984375, "logps/chosen": -415.95001220703125, "logps/rejected": -455.8999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0401244163513184, "rewards/margins": 10.553125381469727, "rewards/rejected": -13.590624809265137, "step": 13510 }, { "epoch": 3.5635213494992093, "grad_norm": 1.0129350057719697, "learning_rate": 1.091855561412757e-07, "logits/chosen": -0.6031738519668579, "logits/rejected": -0.9024413824081421, "logps/chosen": -421.29998779296875, "logps/rejected": -489.8999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.6988282203674316, "rewards/margins": 11.050000190734863, "rewards/rejected": -13.756250381469727, "step": 13520 }, { "epoch": 3.56615709014233, "grad_norm": 0.4123139323587916, "learning_rate": 1.0852662098049552e-07, "logits/chosen": -0.4364013671875, "logits/rejected": -0.763378918170929, "logps/chosen": -410.79998779296875, "logps/rejected": -507.6000061035156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.036181688308716, "rewards/margins": 10.803125381469727, "rewards/rejected": -13.846875190734863, "step": 13530 }, { "epoch": 3.568792830785451, "grad_norm": 1.8878513939633288, "learning_rate": 1.0786768581971533e-07, "logits/chosen": -0.645434558391571, "logits/rejected": -0.829052746295929, "logps/chosen": -372.3500061035156, "logps/rejected": -485.70001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0728516578674316, "rewards/margins": 10.778124809265137, "rewards/rejected": -13.840624809265137, "step": 13540 }, { "epoch": 3.571428571428571, "grad_norm": 0.289527921857307, "learning_rate": 1.0720875065893516e-07, "logits/chosen": -0.6058105230331421, "logits/rejected": -0.888623058795929, "logps/chosen": -411.0, "logps/rejected": -486.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9078125953674316, "rewards/margins": 10.703125, "rewards/rejected": -13.609375, "step": 13550 }, { "epoch": 3.574064312071692, "grad_norm": 4.484184121950979, "learning_rate": 1.0654981549815498e-07, "logits/chosen": -0.5418456792831421, "logits/rejected": -0.8041015863418579, "logps/chosen": -391.75, "logps/rejected": -479.8999938964844, "loss": 0.013, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.968945264816284, "rewards/margins": 10.828125, "rewards/rejected": -13.796875, "step": 13560 }, { "epoch": 3.5767000527148127, "grad_norm": 0.8101713711736572, "learning_rate": 1.0589088033737479e-07, "logits/chosen": -0.605944812297821, "logits/rejected": -0.930957019329071, "logps/chosen": -399.6000061035156, "logps/rejected": -457.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.9419922828674316, "rewards/margins": 11.134374618530273, "rewards/rejected": -14.084375381469727, "step": 13570 }, { "epoch": 3.5793357933579335, "grad_norm": 3.837522511716644, "learning_rate": 1.0523194517659461e-07, "logits/chosen": -0.706250011920929, "logits/rejected": -1.027734398841858, "logps/chosen": -417.0, "logps/rejected": -485.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.1962890625, "rewards/margins": 10.75, "rewards/rejected": -13.946874618530273, "step": 13580 }, { "epoch": 3.5819715340010543, "grad_norm": 0.29507923830638677, "learning_rate": 1.0457301001581444e-07, "logits/chosen": -0.6381591558456421, "logits/rejected": -1.077246069908142, "logps/chosen": -384.6000061035156, "logps/rejected": -466.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.224609375, "rewards/margins": 10.490625381469727, "rewards/rejected": -13.712499618530273, "step": 13590 }, { "epoch": 3.584607274644175, "grad_norm": 0.48033316926705116, "learning_rate": 1.0391407485503426e-07, "logits/chosen": -0.61834716796875, "logits/rejected": -0.757397472858429, "logps/chosen": -430.29998779296875, "logps/rejected": -534.7000122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.12890625, "rewards/margins": 10.7890625, "rewards/rejected": -13.921875, "step": 13600 }, { "epoch": 3.587243015287296, "grad_norm": 0.6706634176630916, "learning_rate": 1.0325513969425407e-07, "logits/chosen": -0.7099853754043579, "logits/rejected": -0.9610351324081421, "logps/chosen": -422.8500061035156, "logps/rejected": -495.1000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.197265625, "rewards/margins": 10.84375, "rewards/rejected": -14.040624618530273, "step": 13610 }, { "epoch": 3.5898787559304166, "grad_norm": 1.8278405914859333, "learning_rate": 1.025962045334739e-07, "logits/chosen": -0.661791980266571, "logits/rejected": -0.869213879108429, "logps/chosen": -374.70001220703125, "logps/rejected": -461.6499938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0855469703674316, "rewards/margins": 10.498437881469727, "rewards/rejected": -13.587499618530273, "step": 13620 }, { "epoch": 3.5925144965735374, "grad_norm": 0.3278250167561455, "learning_rate": 1.0193726937269372e-07, "logits/chosen": -0.5667968988418579, "logits/rejected": -0.7638031244277954, "logps/chosen": -431.6499938964844, "logps/rejected": -498.3999938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.366406202316284, "rewards/margins": 10.796875, "rewards/rejected": -14.171875, "step": 13630 }, { "epoch": 3.5951502372166577, "grad_norm": 1.2541695565665762, "learning_rate": 1.0127833421191354e-07, "logits/chosen": -0.5910094976425171, "logits/rejected": -0.8199707269668579, "logps/chosen": -440.70001220703125, "logps/rejected": -487.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.840039014816284, "rewards/margins": 10.581250190734863, "rewards/rejected": -13.428125381469727, "step": 13640 }, { "epoch": 3.5977859778597785, "grad_norm": 0.5812336739183046, "learning_rate": 1.0061939905113337e-07, "logits/chosen": -0.5570312738418579, "logits/rejected": -0.8757568597793579, "logps/chosen": -380.3999938964844, "logps/rejected": -502.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.993359327316284, "rewards/margins": 11.324999809265137, "rewards/rejected": -14.321874618530273, "step": 13650 }, { "epoch": 3.6004217185028993, "grad_norm": 0.8509475229834391, "learning_rate": 9.996046389035319e-08, "logits/chosen": -0.548779308795929, "logits/rejected": -0.822827160358429, "logps/chosen": -400.54998779296875, "logps/rejected": -464.3999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.2890625, "rewards/margins": 10.809374809265137, "rewards/rejected": -14.096875190734863, "step": 13660 }, { "epoch": 3.60305745914602, "grad_norm": 0.6508623977434623, "learning_rate": 9.9301528729573e-08, "logits/chosen": -0.40196532011032104, "logits/rejected": -0.796093761920929, "logps/chosen": -424.1000061035156, "logps/rejected": -490.70001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.6875977516174316, "rewards/margins": 10.606249809265137, "rewards/rejected": -13.293749809265137, "step": 13670 }, { "epoch": 3.605693199789141, "grad_norm": 0.758098088569614, "learning_rate": 9.864259356879283e-08, "logits/chosen": -0.52728271484375, "logits/rejected": -0.8563903570175171, "logps/chosen": -392.79998779296875, "logps/rejected": -447.1000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.7544922828674316, "rewards/margins": 10.731249809265137, "rewards/rejected": -13.490625381469727, "step": 13680 }, { "epoch": 3.6083289404322616, "grad_norm": 0.6823624758897064, "learning_rate": 9.798365840801265e-08, "logits/chosen": -0.45078736543655396, "logits/rejected": -0.9073241949081421, "logps/chosen": -387.1000061035156, "logps/rejected": -452.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.763867139816284, "rewards/margins": 10.609375, "rewards/rejected": -13.371874809265137, "step": 13690 }, { "epoch": 3.610964681075382, "grad_norm": 34.480452593486724, "learning_rate": 9.732472324723247e-08, "logits/chosen": -0.658557116985321, "logits/rejected": -0.7962402105331421, "logps/chosen": -389.1499938964844, "logps/rejected": -459.70001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.606640577316284, "rewards/margins": 10.168749809265137, "rewards/rejected": -13.771875381469727, "step": 13700 }, { "epoch": 3.6136004217185027, "grad_norm": 0.7211971064229621, "learning_rate": 9.666578808645228e-08, "logits/chosen": -0.60595703125, "logits/rejected": -0.906445324420929, "logps/chosen": -417.5, "logps/rejected": -515.7999877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.07525634765625, "rewards/margins": 10.637499809265137, "rewards/rejected": -13.706250190734863, "step": 13710 }, { "epoch": 3.6162361623616235, "grad_norm": 0.804854214326717, "learning_rate": 9.600685292567211e-08, "logits/chosen": -0.56201171875, "logits/rejected": -0.862011730670929, "logps/chosen": -408.0, "logps/rejected": -491.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.178515672683716, "rewards/margins": 10.728124618530273, "rewards/rejected": -13.912500381469727, "step": 13720 }, { "epoch": 3.6188719030047443, "grad_norm": 0.5773539549550687, "learning_rate": 9.534791776489193e-08, "logits/chosen": -0.3632751405239105, "logits/rejected": -0.779736340045929, "logps/chosen": -432.45001220703125, "logps/rejected": -511.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0453124046325684, "rewards/margins": 11.015625, "rewards/rejected": -14.053125381469727, "step": 13730 }, { "epoch": 3.621507643647865, "grad_norm": 2.094149556330552, "learning_rate": 9.468898260411174e-08, "logits/chosen": -0.6404784917831421, "logits/rejected": -0.8670898675918579, "logps/chosen": -377.04998779296875, "logps/rejected": -480.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.7516112327575684, "rewards/margins": 10.984375, "rewards/rejected": -13.728124618530273, "step": 13740 }, { "epoch": 3.624143384290986, "grad_norm": 0.20466039068203876, "learning_rate": 9.403004744333157e-08, "logits/chosen": -0.7650390863418579, "logits/rejected": -0.8990539312362671, "logps/chosen": -392.20001220703125, "logps/rejected": -466.04998779296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.58203125, "rewards/margins": 10.278124809265137, "rewards/rejected": -13.871874809265137, "step": 13750 }, { "epoch": 3.6267791249341066, "grad_norm": 0.8788893986849959, "learning_rate": 9.337111228255139e-08, "logits/chosen": -0.612561047077179, "logits/rejected": -0.838330090045929, "logps/chosen": -378.79998779296875, "logps/rejected": -450.8999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.543750047683716, "rewards/margins": 10.778124809265137, "rewards/rejected": -13.334375381469727, "step": 13760 }, { "epoch": 3.6294148655772274, "grad_norm": 0.23139889706211447, "learning_rate": 9.27121771217712e-08, "logits/chosen": -0.61737060546875, "logits/rejected": -0.9931640625, "logps/chosen": -414.8500061035156, "logps/rejected": -469.8999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.064453125, "rewards/margins": 10.481249809265137, "rewards/rejected": -13.540624618530273, "step": 13770 }, { "epoch": 3.632050606220348, "grad_norm": 0.4699119662903808, "learning_rate": 9.205324196099104e-08, "logits/chosen": -0.5423828363418579, "logits/rejected": -0.874951183795929, "logps/chosen": -428.3500061035156, "logps/rejected": -494.29998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.6437134742736816, "rewards/margins": 10.893750190734863, "rewards/rejected": -13.546875, "step": 13780 }, { "epoch": 3.6346863468634685, "grad_norm": 4.111520582200058, "learning_rate": 9.139430680021086e-08, "logits/chosen": -0.6304687261581421, "logits/rejected": -0.895703136920929, "logps/chosen": -459.3999938964844, "logps/rejected": -517.7999877929688, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.259765625, "rewards/margins": 10.453125, "rewards/rejected": -13.721875190734863, "step": 13790 }, { "epoch": 3.6373220875065893, "grad_norm": 6.1212806581799235, "learning_rate": 9.073537163943068e-08, "logits/chosen": -0.625195324420929, "logits/rejected": -0.879833996295929, "logps/chosen": -395.3500061035156, "logps/rejected": -467.0, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.426562547683716, "rewards/margins": 10.342187881469727, "rewards/rejected": -13.778124809265137, "step": 13800 }, { "epoch": 3.63995782814971, "grad_norm": 1.1101501059323584, "learning_rate": 9.00764364786505e-08, "logits/chosen": -0.6541992425918579, "logits/rejected": -0.965258777141571, "logps/chosen": -414.29998779296875, "logps/rejected": -479.79998779296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.633984327316284, "rewards/margins": 10.3125, "rewards/rejected": -13.940625190734863, "step": 13810 }, { "epoch": 3.642593568792831, "grad_norm": 10.892494103007001, "learning_rate": 8.941750131787032e-08, "logits/chosen": -0.63299560546875, "logits/rejected": -0.7544921636581421, "logps/chosen": -383.20001220703125, "logps/rejected": -489.8999938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.247668504714966, "rewards/margins": 10.615625381469727, "rewards/rejected": -13.871874809265137, "step": 13820 }, { "epoch": 3.6452293094359516, "grad_norm": 2.457490474676304, "learning_rate": 8.875856615709014e-08, "logits/chosen": -0.4839111268520355, "logits/rejected": -0.8580566644668579, "logps/chosen": -380.3500061035156, "logps/rejected": -467.8999938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.313232421875, "rewards/margins": 10.396875381469727, "rewards/rejected": -13.696874618530273, "step": 13830 }, { "epoch": 3.6478650500790724, "grad_norm": 0.7946820631509098, "learning_rate": 8.809963099630995e-08, "logits/chosen": -0.37762451171875, "logits/rejected": -0.9125000238418579, "logps/chosen": -390.95001220703125, "logps/rejected": -427.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.215625047683716, "rewards/margins": 10.574999809265137, "rewards/rejected": -13.787500381469727, "step": 13840 }, { "epoch": 3.6505007907221927, "grad_norm": 0.2232502360757403, "learning_rate": 8.744069583552978e-08, "logits/chosen": -0.4696289002895355, "logits/rejected": -0.900634765625, "logps/chosen": -384.95001220703125, "logps/rejected": -447.1000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.0589842796325684, "rewards/margins": 10.610937118530273, "rewards/rejected": -13.678125381469727, "step": 13850 }, { "epoch": 3.6531365313653135, "grad_norm": 0.15612458277530203, "learning_rate": 8.67817606747496e-08, "logits/chosen": -0.6374267339706421, "logits/rejected": -1.0486328601837158, "logps/chosen": -385.79998779296875, "logps/rejected": -449.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.037109375, "rewards/margins": 10.290624618530273, "rewards/rejected": -14.321874618530273, "step": 13860 }, { "epoch": 3.6557722720084342, "grad_norm": 0.5779521500798321, "learning_rate": 8.612282551396942e-08, "logits/chosen": -0.741259753704071, "logits/rejected": -0.91162109375, "logps/chosen": -403.6499938964844, "logps/rejected": -488.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.6664061546325684, "rewards/margins": 11.212499618530273, "rewards/rejected": -14.881250381469727, "step": 13870 }, { "epoch": 3.658408012651555, "grad_norm": 3.6939594841769794, "learning_rate": 8.546389035318924e-08, "logits/chosen": -0.597582995891571, "logits/rejected": -0.9258788824081421, "logps/chosen": -428.1499938964844, "logps/rejected": -493.0, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.659716844558716, "rewards/margins": 11.446874618530273, "rewards/rejected": -15.100000381469727, "step": 13880 }, { "epoch": 3.661043753294676, "grad_norm": 0.23373323898150034, "learning_rate": 8.480495519240906e-08, "logits/chosen": -0.630664050579071, "logits/rejected": -0.7331787347793579, "logps/chosen": -380.5, "logps/rejected": -505.3999938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9351563453674316, "rewards/margins": 10.887499809265137, "rewards/rejected": -13.824999809265137, "step": 13890 }, { "epoch": 3.6636794939377966, "grad_norm": 1.1593131951265818, "learning_rate": 8.414602003162888e-08, "logits/chosen": -0.5799804925918579, "logits/rejected": -0.8128417730331421, "logps/chosen": -370.20001220703125, "logps/rejected": -452.79998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.08984375, "rewards/margins": 10.815625190734863, "rewards/rejected": -13.909375190734863, "step": 13900 }, { "epoch": 3.6663152345809173, "grad_norm": 0.560570548338262, "learning_rate": 8.34870848708487e-08, "logits/chosen": -0.5861450433731079, "logits/rejected": -0.852587878704071, "logps/chosen": -416.8999938964844, "logps/rejected": -477.6000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.3765625953674316, "rewards/margins": 10.509374618530273, "rewards/rejected": -13.890625, "step": 13910 }, { "epoch": 3.668950975224038, "grad_norm": 0.8219077579007759, "learning_rate": 8.282814971006853e-08, "logits/chosen": -0.542675793170929, "logits/rejected": -0.907177746295929, "logps/chosen": -351.6499938964844, "logps/rejected": -447.29998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.2164063453674316, "rewards/margins": 9.928125381469727, "rewards/rejected": -13.143750190734863, "step": 13920 }, { "epoch": 3.671586715867159, "grad_norm": 0.4013075441200207, "learning_rate": 8.216921454928835e-08, "logits/chosen": -0.545703113079071, "logits/rejected": -0.9603027105331421, "logps/chosen": -420.0, "logps/rejected": -477.20001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.760546922683716, "rewards/margins": 11.056249618530273, "rewards/rejected": -13.815625190734863, "step": 13930 }, { "epoch": 3.6742224565102792, "grad_norm": 349.37827035377995, "learning_rate": 8.151027938850816e-08, "logits/chosen": -0.492584228515625, "logits/rejected": -0.8455566167831421, "logps/chosen": -369.45001220703125, "logps/rejected": -464.0, "loss": 0.0138, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6529297828674316, "rewards/margins": 10.832812309265137, "rewards/rejected": -13.484375, "step": 13940 }, { "epoch": 3.6768581971534, "grad_norm": 4.408467265757484, "learning_rate": 8.085134422772799e-08, "logits/chosen": -0.6470702886581421, "logits/rejected": -0.9546874761581421, "logps/chosen": -403.1000061035156, "logps/rejected": -506.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.331249952316284, "rewards/margins": 10.949999809265137, "rewards/rejected": -14.278124809265137, "step": 13950 }, { "epoch": 3.679493937796521, "grad_norm": 0.8160960813171894, "learning_rate": 8.019240906694781e-08, "logits/chosen": -0.601245105266571, "logits/rejected": -0.8985351324081421, "logps/chosen": -437.04998779296875, "logps/rejected": -529.9000244140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.522656202316284, "rewards/margins": 11.134374618530273, "rewards/rejected": -14.668749809265137, "step": 13960 }, { "epoch": 3.6821296784396416, "grad_norm": 0.85073153939251, "learning_rate": 7.953347390616764e-08, "logits/chosen": -0.6625305414199829, "logits/rejected": -0.9999023675918579, "logps/chosen": -399.79998779296875, "logps/rejected": -481.70001220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.3179688453674316, "rewards/margins": 10.59375, "rewards/rejected": -13.912500381469727, "step": 13970 }, { "epoch": 3.6847654190827623, "grad_norm": 0.9717826798734855, "learning_rate": 7.887453874538745e-08, "logits/chosen": -0.3757080137729645, "logits/rejected": -0.8560546636581421, "logps/chosen": -423.3500061035156, "logps/rejected": -467.70001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.311718702316284, "rewards/margins": 10.625, "rewards/rejected": -13.934374809265137, "step": 13980 }, { "epoch": 3.687401159725883, "grad_norm": 3.186615519587164, "learning_rate": 7.821560358460727e-08, "logits/chosen": -0.5791260004043579, "logits/rejected": -0.9428466558456421, "logps/chosen": -388.29998779296875, "logps/rejected": -443.79998779296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.186718702316284, "rewards/margins": 10.4375, "rewards/rejected": -13.628125190734863, "step": 13990 }, { "epoch": 3.6900369003690034, "grad_norm": 2.4729021658123505, "learning_rate": 7.75566684238271e-08, "logits/chosen": -0.5593627691268921, "logits/rejected": -0.922290027141571, "logps/chosen": -382.6499938964844, "logps/rejected": -472.70001220703125, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.361328125, "rewards/margins": 10.659375190734863, "rewards/rejected": -14.021875381469727, "step": 14000 }, { "epoch": 3.6926726410121242, "grad_norm": 14.822899179948948, "learning_rate": 7.68977332630469e-08, "logits/chosen": -0.5715271234512329, "logits/rejected": -1.000585913658142, "logps/chosen": -416.8999938964844, "logps/rejected": -481.79998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.0414061546325684, "rewards/margins": 10.346875190734863, "rewards/rejected": -13.384374618530273, "step": 14010 }, { "epoch": 3.695308381655245, "grad_norm": 0.07877533273872481, "learning_rate": 7.623879810226673e-08, "logits/chosen": -0.583251953125, "logits/rejected": -0.8265625238418579, "logps/chosen": -435.79998779296875, "logps/rejected": -504.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.13671875, "rewards/margins": 11.215624809265137, "rewards/rejected": -14.34375, "step": 14020 }, { "epoch": 3.6979441222983658, "grad_norm": 47.48791093022184, "learning_rate": 7.557986294148655e-08, "logits/chosen": -0.48798829317092896, "logits/rejected": -0.71142578125, "logps/chosen": -412.45001220703125, "logps/rejected": -486.1000061035156, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0658202171325684, "rewards/margins": 10.756250381469727, "rewards/rejected": -13.824999809265137, "step": 14030 }, { "epoch": 3.7005798629414866, "grad_norm": 0.1833056141598741, "learning_rate": 7.492092778070638e-08, "logits/chosen": -0.6432861089706421, "logits/rejected": -0.907958984375, "logps/chosen": -424.95001220703125, "logps/rejected": -510.1000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.581249952316284, "rewards/margins": 11.178125381469727, "rewards/rejected": -14.746874809265137, "step": 14040 }, { "epoch": 3.7032156035846073, "grad_norm": 5.318370710919085, "learning_rate": 7.426199261992619e-08, "logits/chosen": -0.7028442621231079, "logits/rejected": -1.014318823814392, "logps/chosen": -370.45001220703125, "logps/rejected": -455.29998779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.4839844703674316, "rewards/margins": 10.159375190734863, "rewards/rejected": -13.646875381469727, "step": 14050 }, { "epoch": 3.705851344227728, "grad_norm": 0.2050112191169435, "learning_rate": 7.360305745914602e-08, "logits/chosen": -0.6169677972793579, "logits/rejected": -0.8377929925918579, "logps/chosen": -419.3999938964844, "logps/rejected": -493.95001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.114062547683716, "rewards/margins": 11.168749809265137, "rewards/rejected": -14.274999618530273, "step": 14060 }, { "epoch": 3.708487084870849, "grad_norm": 0.5229561600071835, "learning_rate": 7.294412229836585e-08, "logits/chosen": -0.523144543170929, "logits/rejected": -0.8949340581893921, "logps/chosen": -416.8500061035156, "logps/rejected": -486.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.03125, "rewards/margins": 10.728124618530273, "rewards/rejected": -13.768750190734863, "step": 14070 }, { "epoch": 3.7111228255139697, "grad_norm": 0.9823667188768822, "learning_rate": 7.228518713758566e-08, "logits/chosen": -0.6457763910293579, "logits/rejected": -1.0154297351837158, "logps/chosen": -442.20001220703125, "logps/rejected": -512.9000244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.8667969703674316, "rewards/margins": 10.537500381469727, "rewards/rejected": -14.412500381469727, "step": 14080 }, { "epoch": 3.71375856615709, "grad_norm": 36.154037938079775, "learning_rate": 7.162625197680548e-08, "logits/chosen": -0.584521472454071, "logits/rejected": -0.844433605670929, "logps/chosen": -401.3999938964844, "logps/rejected": -489.8999938964844, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.262011766433716, "rewards/margins": 11.162500381469727, "rewards/rejected": -14.421875, "step": 14090 }, { "epoch": 3.7163943068002108, "grad_norm": 0.35088964692614033, "learning_rate": 7.09673168160253e-08, "logits/chosen": -0.5054565668106079, "logits/rejected": -0.7036377191543579, "logps/chosen": -439.20001220703125, "logps/rejected": -510.3999938964844, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.2085938453674316, "rewards/margins": 10.78125, "rewards/rejected": -13.981249809265137, "step": 14100 }, { "epoch": 3.7190300474433315, "grad_norm": 0.4108205692293796, "learning_rate": 7.030838165524512e-08, "logits/chosen": -0.703503429889679, "logits/rejected": -1.006103515625, "logps/chosen": -409.79998779296875, "logps/rejected": -513.7000122070312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6097655296325684, "rewards/margins": 11.068750381469727, "rewards/rejected": -14.675000190734863, "step": 14110 }, { "epoch": 3.7216657880864523, "grad_norm": 12.66359504447539, "learning_rate": 6.964944649446494e-08, "logits/chosen": -0.5050293207168579, "logits/rejected": -0.908007800579071, "logps/chosen": -382.5, "logps/rejected": -459.8500061035156, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.9703125953674316, "rewards/margins": 10.595312118530273, "rewards/rejected": -14.559374809265137, "step": 14120 }, { "epoch": 3.724301528729573, "grad_norm": 1.560942051306645, "learning_rate": 6.899051133368476e-08, "logits/chosen": -0.643847644329071, "logits/rejected": -0.9517577886581421, "logps/chosen": -380.6000061035156, "logps/rejected": -456.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.5492186546325684, "rewards/margins": 10.840624809265137, "rewards/rejected": -14.396875381469727, "step": 14130 }, { "epoch": 3.726937269372694, "grad_norm": 1.4820376283400978, "learning_rate": 6.833157617290459e-08, "logits/chosen": -0.4596191346645355, "logits/rejected": -0.909497082233429, "logps/chosen": -394.04998779296875, "logps/rejected": -444.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.1470704078674316, "rewards/margins": 10.290624618530273, "rewards/rejected": -13.428125381469727, "step": 14140 }, { "epoch": 3.729573010015814, "grad_norm": 0.6157346655689292, "learning_rate": 6.76726410121244e-08, "logits/chosen": -0.713793933391571, "logits/rejected": -0.8521728515625, "logps/chosen": -406.45001220703125, "logps/rejected": -488.1000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.936718702316284, "rewards/margins": 10.581250190734863, "rewards/rejected": -13.521875381469727, "step": 14150 }, { "epoch": 3.732208750658935, "grad_norm": 0.22560558351101448, "learning_rate": 6.701370585134422e-08, "logits/chosen": -0.442626953125, "logits/rejected": -0.7556396722793579, "logps/chosen": -389.79998779296875, "logps/rejected": -495.70001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.196093797683716, "rewards/margins": 10.53125, "rewards/rejected": -13.737500190734863, "step": 14160 }, { "epoch": 3.7348444913020558, "grad_norm": 0.8173813555404668, "learning_rate": 6.635477069056405e-08, "logits/chosen": -0.6468871831893921, "logits/rejected": -1.007226586341858, "logps/chosen": -426.0, "logps/rejected": -505.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.991406202316284, "rewards/margins": 10.350000381469727, "rewards/rejected": -13.34375, "step": 14170 }, { "epoch": 3.7374802319451765, "grad_norm": 0.6477554890257848, "learning_rate": 6.569583552978386e-08, "logits/chosen": -0.3442626893520355, "logits/rejected": -0.894824206829071, "logps/chosen": -456.75, "logps/rejected": -491.5, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.668994188308716, "rewards/margins": 10.771875381469727, "rewards/rejected": -13.446874618530273, "step": 14180 }, { "epoch": 3.7401159725882973, "grad_norm": 0.9540444543491131, "learning_rate": 6.503690036900368e-08, "logits/chosen": -0.565411388874054, "logits/rejected": -0.94775390625, "logps/chosen": -392.70001220703125, "logps/rejected": -465.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.5570311546325684, "rewards/margins": 10.868749618530273, "rewards/rejected": -14.425000190734863, "step": 14190 }, { "epoch": 3.742751713231418, "grad_norm": 1.1390029756970157, "learning_rate": 6.437796520822352e-08, "logits/chosen": -0.43482667207717896, "logits/rejected": -0.960253894329071, "logps/chosen": -426.29998779296875, "logps/rejected": -471.1000061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.508496046066284, "rewards/margins": 10.631250381469727, "rewards/rejected": -13.153124809265137, "step": 14200 }, { "epoch": 3.745387453874539, "grad_norm": 2.3344799807535903, "learning_rate": 6.371903004744334e-08, "logits/chosen": -0.529345691204071, "logits/rejected": -0.927929699420929, "logps/chosen": -400.79998779296875, "logps/rejected": -455.3999938964844, "loss": 0.0068, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0875000953674316, "rewards/margins": 10.431249618530273, "rewards/rejected": -13.515625, "step": 14210 }, { "epoch": 3.7480231945176596, "grad_norm": 0.6763760654509756, "learning_rate": 6.306009488666315e-08, "logits/chosen": -0.6522216796875, "logits/rejected": -0.942187488079071, "logps/chosen": -408.6000061035156, "logps/rejected": -475.29998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.836621046066284, "rewards/margins": 11.103124618530273, "rewards/rejected": -13.934374809265137, "step": 14220 }, { "epoch": 3.7506589351607804, "grad_norm": 0.20920747795453495, "learning_rate": 6.240115972588297e-08, "logits/chosen": -0.6368163824081421, "logits/rejected": -0.9393310546875, "logps/chosen": -418.75, "logps/rejected": -458.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.650585889816284, "rewards/margins": 10.546875, "rewards/rejected": -13.196874618530273, "step": 14230 }, { "epoch": 3.7532946758039007, "grad_norm": 0.2459883412225912, "learning_rate": 6.17422245651028e-08, "logits/chosen": -0.4462646543979645, "logits/rejected": -0.815234363079071, "logps/chosen": -469.1000061035156, "logps/rejected": -552.0999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.9683594703674316, "rewards/margins": 10.709375381469727, "rewards/rejected": -13.678125381469727, "step": 14240 }, { "epoch": 3.7559304164470215, "grad_norm": 1.290860221669699, "learning_rate": 6.108328940432261e-08, "logits/chosen": -0.618359386920929, "logits/rejected": -0.786755383014679, "logps/chosen": -387.75, "logps/rejected": -471.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.0062499046325684, "rewards/margins": 10.168749809265137, "rewards/rejected": -13.178125381469727, "step": 14250 }, { "epoch": 3.7585661570901423, "grad_norm": 0.7363026738293336, "learning_rate": 6.042435424354243e-08, "logits/chosen": -0.6925293207168579, "logits/rejected": -0.8446044921875, "logps/chosen": -430.04998779296875, "logps/rejected": -476.04998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.2831053733825684, "rewards/margins": 10.109375, "rewards/rejected": -13.399999618530273, "step": 14260 }, { "epoch": 3.761201897733263, "grad_norm": 1.1771984443403933, "learning_rate": 5.976541908276226e-08, "logits/chosen": -0.6153320074081421, "logits/rejected": -0.999804675579071, "logps/chosen": -411.6499938964844, "logps/rejected": -485.70001220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.9976563453674316, "rewards/margins": 10.621874809265137, "rewards/rejected": -13.621874809265137, "step": 14270 }, { "epoch": 3.763837638376384, "grad_norm": 1.3224006401972295, "learning_rate": 5.9106483921982074e-08, "logits/chosen": -0.554699718952179, "logits/rejected": -0.9208008050918579, "logps/chosen": -421.29998779296875, "logps/rejected": -473.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7265625, "rewards/margins": 10.646875381469727, "rewards/rejected": -13.365625381469727, "step": 14280 }, { "epoch": 3.7664733790195046, "grad_norm": 2.0215003831463765, "learning_rate": 5.844754876120189e-08, "logits/chosen": -0.46287840604782104, "logits/rejected": -0.8409179449081421, "logps/chosen": -373.75, "logps/rejected": -462.25, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.0082030296325684, "rewards/margins": 10.359375, "rewards/rejected": -13.368749618530273, "step": 14290 }, { "epoch": 3.769109119662625, "grad_norm": 0.2779279103826097, "learning_rate": 5.778861360042172e-08, "logits/chosen": -0.6691039800643921, "logits/rejected": -0.849853515625, "logps/chosen": -423.29998779296875, "logps/rejected": -485.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.340527296066284, "rewards/margins": 10.693750381469727, "rewards/rejected": -13.043749809265137, "step": 14300 }, { "epoch": 3.7717448603057457, "grad_norm": 0.23644446936538802, "learning_rate": 5.712967843964154e-08, "logits/chosen": -0.709747314453125, "logits/rejected": -0.80029296875, "logps/chosen": -399.8999938964844, "logps/rejected": -473.79998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.959277391433716, "rewards/margins": 10.909375190734863, "rewards/rejected": -13.871874809265137, "step": 14310 }, { "epoch": 3.7743806009488665, "grad_norm": 0.0928079529411474, "learning_rate": 5.6470743278861356e-08, "logits/chosen": -0.50689697265625, "logits/rejected": -0.7862304449081421, "logps/chosen": -416.5, "logps/rejected": -520.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.03515625, "rewards/margins": 10.978124618530273, "rewards/rejected": -14.0, "step": 14320 }, { "epoch": 3.7770163415919873, "grad_norm": 0.2360392815252509, "learning_rate": 5.581180811808118e-08, "logits/chosen": -0.5059814453125, "logits/rejected": -0.96630859375, "logps/chosen": -442.1000061035156, "logps/rejected": -485.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.688281297683716, "rewards/margins": 10.415624618530273, "rewards/rejected": -13.106249809265137, "step": 14330 }, { "epoch": 3.779652082235108, "grad_norm": 0.06482459635666235, "learning_rate": 5.5152872957300996e-08, "logits/chosen": -0.6517089605331421, "logits/rejected": -0.87939453125, "logps/chosen": -402.29998779296875, "logps/rejected": -460.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2396483421325684, "rewards/margins": 10.456250190734863, "rewards/rejected": -13.684374809265137, "step": 14340 }, { "epoch": 3.782287822878229, "grad_norm": 0.23970526424620703, "learning_rate": 5.449393779652082e-08, "logits/chosen": -0.34556883573532104, "logits/rejected": -0.914843738079071, "logps/chosen": -435.25, "logps/rejected": -463.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.537890672683716, "rewards/margins": 11.078125, "rewards/rejected": -13.625, "step": 14350 }, { "epoch": 3.7849235635213496, "grad_norm": 6.587670503989847, "learning_rate": 5.383500263574064e-08, "logits/chosen": -0.49187010526657104, "logits/rejected": -0.8392089605331421, "logps/chosen": -406.20001220703125, "logps/rejected": -446.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.665234327316284, "rewards/margins": 10.446874618530273, "rewards/rejected": -13.112500190734863, "step": 14360 }, { "epoch": 3.7875593041644704, "grad_norm": 1.662033430735074, "learning_rate": 5.317606747496047e-08, "logits/chosen": -0.8270263671875, "logits/rejected": -0.9432617425918579, "logps/chosen": -385.95001220703125, "logps/rejected": -482.29998779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.660937547683716, "rewards/margins": 10.371874809265137, "rewards/rejected": -14.024999618530273, "step": 14370 }, { "epoch": 3.790195044807591, "grad_norm": 9.525531075682252, "learning_rate": 5.2517132314180285e-08, "logits/chosen": -0.4955291748046875, "logits/rejected": -0.971710205078125, "logps/chosen": -393.54998779296875, "logps/rejected": -467.3999938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.212890625, "rewards/margins": 11.018750190734863, "rewards/rejected": -14.225000381469727, "step": 14380 }, { "epoch": 3.7928307854507115, "grad_norm": 0.11257991739882127, "learning_rate": 5.18581971534001e-08, "logits/chosen": -0.652587890625, "logits/rejected": -0.910113513469696, "logps/chosen": -356.04998779296875, "logps/rejected": -478.1000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.2867188453674316, "rewards/margins": 10.815625190734863, "rewards/rejected": -14.100000381469727, "step": 14390 }, { "epoch": 3.7954665260938323, "grad_norm": 0.49281439206101724, "learning_rate": 5.1199261992619926e-08, "logits/chosen": -0.45161741971969604, "logits/rejected": -0.7788330316543579, "logps/chosen": -453.79998779296875, "logps/rejected": -465.95001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.888867139816284, "rewards/margins": 10.265625, "rewards/rejected": -13.15625, "step": 14400 }, { "epoch": 3.798102266736953, "grad_norm": 0.44618125676286113, "learning_rate": 5.054032683183974e-08, "logits/chosen": -0.49730223417282104, "logits/rejected": -0.8525390625, "logps/chosen": -403.79998779296875, "logps/rejected": -454.70001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.7540040016174316, "rewards/margins": 10.207812309265137, "rewards/rejected": -12.965624809265137, "step": 14410 }, { "epoch": 3.800738007380074, "grad_norm": 16.176239073944636, "learning_rate": 4.988139167105957e-08, "logits/chosen": -0.44993895292282104, "logits/rejected": -0.869140625, "logps/chosen": -385.25, "logps/rejected": -472.5, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.696093797683716, "rewards/margins": 10.309374809265137, "rewards/rejected": -14.003125190734863, "step": 14420 }, { "epoch": 3.8033737480231946, "grad_norm": 0.48852032358274017, "learning_rate": 4.9222456510279384e-08, "logits/chosen": -0.576098620891571, "logits/rejected": -0.8822265863418579, "logps/chosen": -390.8500061035156, "logps/rejected": -488.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.512890577316284, "rewards/margins": 10.615625381469727, "rewards/rejected": -14.134374618530273, "step": 14430 }, { "epoch": 3.8060094886663154, "grad_norm": 51.61903168237355, "learning_rate": 4.856352134949921e-08, "logits/chosen": -0.501171886920929, "logits/rejected": -0.711376965045929, "logps/chosen": -424.0, "logps/rejected": -481.70001220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.4832520484924316, "rewards/margins": 10.821874618530273, "rewards/rejected": -13.306249618530273, "step": 14440 }, { "epoch": 3.8086452293094357, "grad_norm": 0.3250363550575981, "learning_rate": 4.790458618871903e-08, "logits/chosen": -0.4970703125, "logits/rejected": -0.796862781047821, "logps/chosen": -405.0, "logps/rejected": -472.70001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.2457032203674316, "rewards/margins": 10.746874809265137, "rewards/rejected": -13.993749618530273, "step": 14450 }, { "epoch": 3.8112809699525565, "grad_norm": 0.24575337535722955, "learning_rate": 4.724565102793885e-08, "logits/chosen": -0.5443359613418579, "logits/rejected": -0.9405273199081421, "logps/chosen": -450.3999938964844, "logps/rejected": -466.6000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.86328125, "rewards/margins": 10.296875, "rewards/rejected": -13.162500381469727, "step": 14460 }, { "epoch": 3.8139167105956773, "grad_norm": 0.5428345961528857, "learning_rate": 4.658671586715867e-08, "logits/chosen": -0.5821777582168579, "logits/rejected": -0.9825683832168579, "logps/chosen": -423.75, "logps/rejected": -474.29998779296875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.035937547683716, "rewards/margins": 10.784375190734863, "rewards/rejected": -13.821874618530273, "step": 14470 }, { "epoch": 3.816552451238798, "grad_norm": 1.6387992692817062, "learning_rate": 4.592778070637849e-08, "logits/chosen": -0.6019042730331421, "logits/rejected": -0.8250976800918579, "logps/chosen": -394.3999938964844, "logps/rejected": -471.95001220703125, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.756054639816284, "rewards/margins": 10.393750190734863, "rewards/rejected": -13.134374618530273, "step": 14480 }, { "epoch": 3.819188191881919, "grad_norm": 0.07942246057841292, "learning_rate": 4.526884554559831e-08, "logits/chosen": -0.6000732183456421, "logits/rejected": -0.878710925579071, "logps/chosen": -380.79998779296875, "logps/rejected": -474.70001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.046875, "rewards/margins": 10.715624809265137, "rewards/rejected": -13.765625, "step": 14490 }, { "epoch": 3.8218239325250396, "grad_norm": 0.6981109887006228, "learning_rate": 4.460991038481813e-08, "logits/chosen": -0.5518554449081421, "logits/rejected": -0.9366210699081421, "logps/chosen": -379.8500061035156, "logps/rejected": -458.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.8662109375, "rewards/margins": 10.229687690734863, "rewards/rejected": -13.109375, "step": 14500 }, { "epoch": 3.8244596731681604, "grad_norm": 0.5870456065144408, "learning_rate": 4.3950975224037954e-08, "logits/chosen": -0.79296875, "logits/rejected": -1.0403320789337158, "logps/chosen": -403.79998779296875, "logps/rejected": -484.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.346484422683716, "rewards/margins": 10.506250381469727, "rewards/rejected": -13.856249809265137, "step": 14510 }, { "epoch": 3.827095413811281, "grad_norm": 2.797816788495577, "learning_rate": 4.329204006325778e-08, "logits/chosen": -0.32415771484375, "logits/rejected": -0.796551525592804, "logps/chosen": -429.3999938964844, "logps/rejected": -497.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.40234375, "rewards/margins": 10.978124618530273, "rewards/rejected": -14.387499809265137, "step": 14520 }, { "epoch": 3.829731154454402, "grad_norm": 0.10792834345043384, "learning_rate": 4.2633104902477595e-08, "logits/chosen": -0.4458862245082855, "logits/rejected": -0.708447277545929, "logps/chosen": -419.29998779296875, "logps/rejected": -526.2000122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.090625047683716, "rewards/margins": 10.9375, "rewards/rejected": -14.024999618530273, "step": 14530 }, { "epoch": 3.8323668950975223, "grad_norm": 1.2632120024252629, "learning_rate": 4.197416974169741e-08, "logits/chosen": -0.704394519329071, "logits/rejected": -0.8788086175918579, "logps/chosen": -436.3999938964844, "logps/rejected": -523.2999877929688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.551562547683716, "rewards/margins": 10.823437690734863, "rewards/rejected": -14.381250381469727, "step": 14540 }, { "epoch": 3.835002635740643, "grad_norm": 0.9795049621208458, "learning_rate": 4.1315234580917236e-08, "logits/chosen": -0.624755859375, "logits/rejected": -0.973925769329071, "logps/chosen": -378.79998779296875, "logps/rejected": -488.29998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.393359422683716, "rewards/margins": 10.389062881469727, "rewards/rejected": -13.771875381469727, "step": 14550 }, { "epoch": 3.837638376383764, "grad_norm": 0.550304478511046, "learning_rate": 4.065629942013705e-08, "logits/chosen": -0.5059570074081421, "logits/rejected": -0.9208984375, "logps/chosen": -423.20001220703125, "logps/rejected": -510.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.5347657203674316, "rewards/margins": 10.240625381469727, "rewards/rejected": -13.778124809265137, "step": 14560 }, { "epoch": 3.8402741170268846, "grad_norm": 0.22999161555789216, "learning_rate": 3.999736425935688e-08, "logits/chosen": -0.568450927734375, "logits/rejected": -0.880419909954071, "logps/chosen": -398.70001220703125, "logps/rejected": -470.29998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.190624952316284, "rewards/margins": 10.315625190734863, "rewards/rejected": -13.506250381469727, "step": 14570 }, { "epoch": 3.8429098576700054, "grad_norm": 1.5882290564110702, "learning_rate": 3.93384290985767e-08, "logits/chosen": -0.830273449420929, "logits/rejected": -0.9335082769393921, "logps/chosen": -404.04998779296875, "logps/rejected": -481.54998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.3480467796325684, "rewards/margins": 10.181249618530273, "rewards/rejected": -13.53125, "step": 14580 }, { "epoch": 3.845545598313126, "grad_norm": 0.43824490800678423, "learning_rate": 3.8679493937796525e-08, "logits/chosen": -0.7349182367324829, "logits/rejected": -0.916699230670929, "logps/chosen": -425.3999938964844, "logps/rejected": -496.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.5933594703674316, "rewards/margins": 11.012499809265137, "rewards/rejected": -14.600000381469727, "step": 14590 }, { "epoch": 3.8481813389562465, "grad_norm": 16.516273294997358, "learning_rate": 3.802055877701634e-08, "logits/chosen": -0.419677734375, "logits/rejected": -0.792553722858429, "logps/chosen": -389.79998779296875, "logps/rejected": -492.20001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.09765625, "rewards/margins": 10.348437309265137, "rewards/rejected": -13.446874618530273, "step": 14600 }, { "epoch": 3.8508170795993673, "grad_norm": 1.0484862384738736, "learning_rate": 3.736162361623616e-08, "logits/chosen": -0.520581066608429, "logits/rejected": -0.924121081829071, "logps/chosen": -396.1000061035156, "logps/rejected": -464.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.981250047683716, "rewards/margins": 10.875, "rewards/rejected": -13.862500190734863, "step": 14610 }, { "epoch": 3.853452820242488, "grad_norm": 1.9934900460959009, "learning_rate": 3.670268845545598e-08, "logits/chosen": -0.636962890625, "logits/rejected": -0.9248046875, "logps/chosen": -441.75, "logps/rejected": -459.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.225781202316284, "rewards/margins": 10.321874618530273, "rewards/rejected": -13.534375190734863, "step": 14620 }, { "epoch": 3.856088560885609, "grad_norm": 0.5617014545261722, "learning_rate": 3.60437532946758e-08, "logits/chosen": -0.4155517518520355, "logits/rejected": -0.8062499761581421, "logps/chosen": -390.1000061035156, "logps/rejected": -468.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.9037108421325684, "rewards/margins": 10.837499618530273, "rewards/rejected": -13.743749618530273, "step": 14630 }, { "epoch": 3.8587243015287296, "grad_norm": 27.06408137287053, "learning_rate": 3.5384818133895624e-08, "logits/chosen": -0.550488293170929, "logits/rejected": -0.688671886920929, "logps/chosen": -412.6499938964844, "logps/rejected": -495.95001220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.165234327316284, "rewards/margins": 10.806249618530273, "rewards/rejected": -13.981249809265137, "step": 14640 }, { "epoch": 3.8613600421718504, "grad_norm": 2.482929759460326, "learning_rate": 3.472588297311544e-08, "logits/chosen": -0.5501343011856079, "logits/rejected": -0.8197265863418579, "logps/chosen": -420.29998779296875, "logps/rejected": -528.5999755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.8636717796325684, "rewards/margins": 10.896875381469727, "rewards/rejected": -14.753125190734863, "step": 14650 }, { "epoch": 3.863995782814971, "grad_norm": 0.1975872874026488, "learning_rate": 3.4066947812335265e-08, "logits/chosen": -0.6411377191543579, "logits/rejected": -0.9356445074081421, "logps/chosen": -386.75, "logps/rejected": -439.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.746875047683716, "rewards/margins": 10.240625381469727, "rewards/rejected": -12.987500190734863, "step": 14660 }, { "epoch": 3.866631523458092, "grad_norm": 0.38495904851205903, "learning_rate": 3.340801265155509e-08, "logits/chosen": -0.682812511920929, "logits/rejected": -0.934277355670929, "logps/chosen": -404.20001220703125, "logps/rejected": -476.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.022265672683716, "rewards/margins": 10.887499809265137, "rewards/rejected": -13.918749809265137, "step": 14670 }, { "epoch": 3.8692672641012127, "grad_norm": 0.18145268451909685, "learning_rate": 3.2749077490774905e-08, "logits/chosen": -0.7625732421875, "logits/rejected": -0.824658215045929, "logps/chosen": -393.45001220703125, "logps/rejected": -529.0999755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.585205078125, "rewards/margins": 11.168749809265137, "rewards/rejected": -13.753125190734863, "step": 14680 }, { "epoch": 3.871903004744333, "grad_norm": 0.6766148982667218, "learning_rate": 3.209014232999473e-08, "logits/chosen": -0.7481445074081421, "logits/rejected": -0.99462890625, "logps/chosen": -367.95001220703125, "logps/rejected": -429.3999938964844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.564502000808716, "rewards/margins": 10.4453125, "rewards/rejected": -13.009374618530273, "step": 14690 }, { "epoch": 3.874538745387454, "grad_norm": 7.890904310019645, "learning_rate": 3.1431207169214546e-08, "logits/chosen": -0.6301513910293579, "logits/rejected": -0.878710925579071, "logps/chosen": -378.6499938964844, "logps/rejected": -470.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.4164061546325684, "rewards/margins": 10.918749809265137, "rewards/rejected": -14.331250190734863, "step": 14700 }, { "epoch": 3.8771744860305746, "grad_norm": 0.15255634603534463, "learning_rate": 3.077227200843437e-08, "logits/chosen": -0.6181396245956421, "logits/rejected": -0.951367199420929, "logps/chosen": -421.70001220703125, "logps/rejected": -489.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.501953125, "rewards/margins": 10.234375, "rewards/rejected": -13.737500190734863, "step": 14710 }, { "epoch": 3.8798102266736954, "grad_norm": 0.19423353042625613, "learning_rate": 3.011333684765419e-08, "logits/chosen": -0.466552734375, "logits/rejected": -0.948046863079071, "logps/chosen": -384.20001220703125, "logps/rejected": -500.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.080078125, "rewards/margins": 11.103124618530273, "rewards/rejected": -14.184374809265137, "step": 14720 }, { "epoch": 3.882445967316816, "grad_norm": 0.8530596530434138, "learning_rate": 2.9454401686874008e-08, "logits/chosen": -0.3672851622104645, "logits/rejected": -0.719738781452179, "logps/chosen": -440.29998779296875, "logps/rejected": -484.5, "loss": 0.0052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8115234375, "rewards/margins": 10.596875190734863, "rewards/rejected": -13.40625, "step": 14730 }, { "epoch": 3.885081707959937, "grad_norm": 1.2583366291226534, "learning_rate": 2.879546652609383e-08, "logits/chosen": -0.5755065679550171, "logits/rejected": -0.995898425579071, "logps/chosen": -434.6000061035156, "logps/rejected": -511.29998779296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.6312499046325684, "rewards/margins": 10.699999809265137, "rewards/rejected": -14.34375, "step": 14740 }, { "epoch": 3.8877174486030572, "grad_norm": 0.6902043665856644, "learning_rate": 2.8136531365313652e-08, "logits/chosen": -0.5609985589981079, "logits/rejected": -0.8791748285293579, "logps/chosen": -401.1000061035156, "logps/rejected": -479.79998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.095996141433716, "rewards/margins": 10.615625381469727, "rewards/rejected": -13.696874618530273, "step": 14750 }, { "epoch": 3.890353189246178, "grad_norm": 0.0554031326003888, "learning_rate": 2.7477596204533472e-08, "logits/chosen": -0.59381103515625, "logits/rejected": -0.908538818359375, "logps/chosen": -415.29998779296875, "logps/rejected": -486.70001220703125, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.419140577316284, "rewards/margins": 10.868749618530273, "rewards/rejected": -14.290624618530273, "step": 14760 }, { "epoch": 3.892988929889299, "grad_norm": 3.5669847253505997, "learning_rate": 2.6818661043753293e-08, "logits/chosen": -0.39763182401657104, "logits/rejected": -0.8432861566543579, "logps/chosen": -417.6499938964844, "logps/rejected": -497.6000061035156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.1695313453674316, "rewards/margins": 10.518750190734863, "rewards/rejected": -13.690625190734863, "step": 14770 }, { "epoch": 3.8956246705324196, "grad_norm": 0.740231344151168, "learning_rate": 2.6159725882973117e-08, "logits/chosen": -0.6180664300918579, "logits/rejected": -0.8880859613418579, "logps/chosen": -388.8500061035156, "logps/rejected": -455.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0609374046325684, "rewards/margins": 10.399999618530273, "rewards/rejected": -13.462499618530273, "step": 14780 }, { "epoch": 3.8982604111755403, "grad_norm": 0.10660772458110516, "learning_rate": 2.5500790722192934e-08, "logits/chosen": -0.604870617389679, "logits/rejected": -0.941943347454071, "logps/chosen": -436.0, "logps/rejected": -492.70001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.1988282203674316, "rewards/margins": 10.734375, "rewards/rejected": -13.925000190734863, "step": 14790 }, { "epoch": 3.900896151818661, "grad_norm": 0.45793123642763683, "learning_rate": 2.4841855561412754e-08, "logits/chosen": -0.618115246295929, "logits/rejected": -0.937304675579071, "logps/chosen": -438.5, "logps/rejected": -497.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.0472168922424316, "rewards/margins": 10.875, "rewards/rejected": -13.928125381469727, "step": 14800 }, { "epoch": 3.903531892461782, "grad_norm": 0.11863846920947585, "learning_rate": 2.4182920400632578e-08, "logits/chosen": -0.660839855670929, "logits/rejected": -0.931835949420929, "logps/chosen": -379.0, "logps/rejected": -478.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0933594703674316, "rewards/margins": 10.703125, "rewards/rejected": -13.784375190734863, "step": 14810 }, { "epoch": 3.9061676331049027, "grad_norm": 1.3117080500992122, "learning_rate": 2.35239852398524e-08, "logits/chosen": -0.601733386516571, "logits/rejected": -0.895495593547821, "logps/chosen": -402.6000061035156, "logps/rejected": -488.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.853710889816284, "rewards/margins": 10.609375, "rewards/rejected": -13.462499618530273, "step": 14820 }, { "epoch": 3.9088033737480234, "grad_norm": 0.9302677646863057, "learning_rate": 2.286505007907222e-08, "logits/chosen": -0.4129882752895355, "logits/rejected": -0.8980468511581421, "logps/chosen": -435.1000061035156, "logps/rejected": -521.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.5914063453674316, "rewards/margins": 11.034375190734863, "rewards/rejected": -14.621874809265137, "step": 14830 }, { "epoch": 3.911439114391144, "grad_norm": 3.5176499019708327, "learning_rate": 2.2206114918292036e-08, "logits/chosen": -0.647778332233429, "logits/rejected": -1.001367211341858, "logps/chosen": -388.95001220703125, "logps/rejected": -465.3999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.458691358566284, "rewards/margins": 10.475000381469727, "rewards/rejected": -13.925000190734863, "step": 14840 }, { "epoch": 3.9140748550342646, "grad_norm": 1.8152174675610646, "learning_rate": 2.154717975751186e-08, "logits/chosen": -0.691699206829071, "logits/rejected": -0.909375011920929, "logps/chosen": -425.6000061035156, "logps/rejected": -478.20001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7649903297424316, "rewards/margins": 10.729687690734863, "rewards/rejected": -13.487500190734863, "step": 14850 }, { "epoch": 3.9167105956773853, "grad_norm": 0.18566226385201712, "learning_rate": 2.088824459673168e-08, "logits/chosen": -0.586132824420929, "logits/rejected": -0.926464855670929, "logps/chosen": -424.1000061035156, "logps/rejected": -476.0, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.08984375, "rewards/margins": 10.265625, "rewards/rejected": -13.353124618530273, "step": 14860 }, { "epoch": 3.919346336320506, "grad_norm": 0.413545235800586, "learning_rate": 2.02293094359515e-08, "logits/chosen": -0.720837414264679, "logits/rejected": -1.017187476158142, "logps/chosen": -405.6000061035156, "logps/rejected": -494.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.360156297683716, "rewards/margins": 10.768750190734863, "rewards/rejected": -14.128125190734863, "step": 14870 }, { "epoch": 3.921982076963627, "grad_norm": 1.4101882739863236, "learning_rate": 1.9570374275171325e-08, "logits/chosen": -0.5394836664199829, "logits/rejected": -0.778076171875, "logps/chosen": -415.75, "logps/rejected": -506.29998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.253124952316284, "rewards/margins": 10.746874809265137, "rewards/rejected": -13.996874809265137, "step": 14880 }, { "epoch": 3.9246178176067477, "grad_norm": 0.8270306842190701, "learning_rate": 1.8911439114391145e-08, "logits/chosen": -0.522167980670929, "logits/rejected": -0.9932616949081421, "logps/chosen": -424.1000061035156, "logps/rejected": -460.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.123828172683716, "rewards/margins": 10.732812881469727, "rewards/rejected": -13.853124618530273, "step": 14890 }, { "epoch": 3.927253558249868, "grad_norm": 3.2516220360756307, "learning_rate": 1.8252503953610962e-08, "logits/chosen": -0.617626965045929, "logits/rejected": -0.701995849609375, "logps/chosen": -399.1499938964844, "logps/rejected": -473.6000061035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.00146484375, "rewards/margins": 10.815625190734863, "rewards/rejected": -13.803125381469727, "step": 14900 }, { "epoch": 3.9298892988929888, "grad_norm": 5.199326109257858, "learning_rate": 1.7593568792830783e-08, "logits/chosen": -0.574511706829071, "logits/rejected": -0.906506359577179, "logps/chosen": -406.54998779296875, "logps/rejected": -491.6000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.200390577316284, "rewards/margins": 10.690625190734863, "rewards/rejected": -13.896875381469727, "step": 14910 }, { "epoch": 3.9325250395361095, "grad_norm": 0.17540673038379953, "learning_rate": 1.6934633632050606e-08, "logits/chosen": -0.681591808795929, "logits/rejected": -0.9747070074081421, "logps/chosen": -361.3500061035156, "logps/rejected": -469.20001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.5250000953674316, "rewards/margins": 10.75, "rewards/rejected": -14.278124809265137, "step": 14920 }, { "epoch": 3.9351607801792303, "grad_norm": 0.2230458131433391, "learning_rate": 1.6275698471270427e-08, "logits/chosen": -0.5697265863418579, "logits/rejected": -0.7056640386581421, "logps/chosen": -382.75, "logps/rejected": -475.6000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.337109327316284, "rewards/margins": 10.53125, "rewards/rejected": -13.868749618530273, "step": 14930 }, { "epoch": 3.937796520822351, "grad_norm": 0.23710141593570042, "learning_rate": 1.5616763310490247e-08, "logits/chosen": -0.620288074016571, "logits/rejected": -0.994921863079071, "logps/chosen": -389.70001220703125, "logps/rejected": -469.5, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.310742139816284, "rewards/margins": 10.9375, "rewards/rejected": -14.243749618530273, "step": 14940 }, { "epoch": 3.940432261465472, "grad_norm": 0.4044332355598266, "learning_rate": 1.4957828149710068e-08, "logits/chosen": -0.556225597858429, "logits/rejected": -0.983691394329071, "logps/chosen": -338.75, "logps/rejected": -408.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8714842796325684, "rewards/margins": 10.559374809265137, "rewards/rejected": -13.425000190734863, "step": 14950 }, { "epoch": 3.9430680021085927, "grad_norm": 0.46465718956751406, "learning_rate": 1.429889298892989e-08, "logits/chosen": -0.7412109375, "logits/rejected": -0.905810534954071, "logps/chosen": -354.3500061035156, "logps/rejected": -436.1000061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.966601610183716, "rewards/margins": 10.681249618530273, "rewards/rejected": -13.649999618530273, "step": 14960 }, { "epoch": 3.9457037427517134, "grad_norm": 2.4733207712090888, "learning_rate": 1.3639957828149709e-08, "logits/chosen": -0.581494152545929, "logits/rejected": -0.832202136516571, "logps/chosen": -454.29998779296875, "logps/rejected": -533.2999877929688, "loss": 0.0097, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.114550828933716, "rewards/margins": 10.490625381469727, "rewards/rejected": -13.603124618530273, "step": 14970 }, { "epoch": 3.948339483394834, "grad_norm": 1.5747277078073207, "learning_rate": 1.2981022667369531e-08, "logits/chosen": -0.527539074420929, "logits/rejected": -0.888671875, "logps/chosen": -423.70001220703125, "logps/rejected": -480.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.4404296875, "rewards/margins": 10.578125, "rewards/rejected": -14.012499809265137, "step": 14980 }, { "epoch": 3.9509752240379545, "grad_norm": 1.217021372729792, "learning_rate": 1.232208750658935e-08, "logits/chosen": -0.6524413824081421, "logits/rejected": -0.9308105707168579, "logps/chosen": -391.20001220703125, "logps/rejected": -463.1000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.46484375, "rewards/margins": 10.759374618530273, "rewards/rejected": -14.225000381469727, "step": 14990 }, { "epoch": 3.9536109646810753, "grad_norm": 1.5085353687951455, "learning_rate": 1.1663152345809172e-08, "logits/chosen": -0.645312488079071, "logits/rejected": -0.960644543170929, "logps/chosen": -400.20001220703125, "logps/rejected": -470.3999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.2701172828674316, "rewards/margins": 10.699999809265137, "rewards/rejected": -13.971875190734863, "step": 15000 }, { "epoch": 3.956246705324196, "grad_norm": 0.4021908445547856, "learning_rate": 1.1004217185028992e-08, "logits/chosen": -0.4552001953125, "logits/rejected": -0.869824230670929, "logps/chosen": -390.3500061035156, "logps/rejected": -451.25, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.783398389816284, "rewards/margins": 11.475000381469727, "rewards/rejected": -14.262499809265137, "step": 15010 }, { "epoch": 3.958882445967317, "grad_norm": 0.20841711734893115, "learning_rate": 1.0345282024248813e-08, "logits/chosen": -0.4139160215854645, "logits/rejected": -0.916796863079071, "logps/chosen": -383.79998779296875, "logps/rejected": -463.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9350342750549316, "rewards/margins": 10.509374618530273, "rewards/rejected": -13.443750381469727, "step": 15020 }, { "epoch": 3.9615181866104376, "grad_norm": 0.1442469602279184, "learning_rate": 9.686346863468635e-09, "logits/chosen": -0.666516125202179, "logits/rejected": -0.863574206829071, "logps/chosen": -466.45001220703125, "logps/rejected": -502.70001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.153515577316284, "rewards/margins": 10.649999618530273, "rewards/rejected": -13.800000190734863, "step": 15030 }, { "epoch": 3.964153927253558, "grad_norm": 0.4334231954429962, "learning_rate": 9.027411702688455e-09, "logits/chosen": -0.5501708984375, "logits/rejected": -0.8324218988418579, "logps/chosen": -411.3999938964844, "logps/rejected": -472.79998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.207812547683716, "rewards/margins": 10.537500381469727, "rewards/rejected": -13.746874809265137, "step": 15040 }, { "epoch": 3.9667896678966788, "grad_norm": 3.285101327147521, "learning_rate": 8.368476541908276e-09, "logits/chosen": -0.5959228277206421, "logits/rejected": -0.7786620855331421, "logps/chosen": -388.6499938964844, "logps/rejected": -480.79998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.184765577316284, "rewards/margins": 10.956250190734863, "rewards/rejected": -14.143750190734863, "step": 15050 }, { "epoch": 3.9694254085397995, "grad_norm": 3.480107768755036, "learning_rate": 7.709541381128096e-09, "logits/chosen": -0.8079833984375, "logits/rejected": -0.935839831829071, "logps/chosen": -365.79998779296875, "logps/rejected": -434.54998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.225390672683716, "rewards/margins": 10.15625, "rewards/rejected": -13.381250381469727, "step": 15060 }, { "epoch": 3.9720611491829203, "grad_norm": 0.25564341054773654, "learning_rate": 7.0506062203479176e-09, "logits/chosen": -0.6243041753768921, "logits/rejected": -0.9132324457168579, "logps/chosen": -386.25, "logps/rejected": -448.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.4527344703674316, "rewards/margins": 10.443750381469727, "rewards/rejected": -13.899999618530273, "step": 15070 }, { "epoch": 3.974696889826041, "grad_norm": 0.40383631291115907, "learning_rate": 6.391671059567738e-09, "logits/chosen": -0.6392822265625, "logits/rejected": -0.993359386920929, "logps/chosen": -385.6499938964844, "logps/rejected": -486.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.131640672683716, "rewards/margins": 11.143750190734863, "rewards/rejected": -14.265625, "step": 15080 }, { "epoch": 3.977332630469162, "grad_norm": 0.5222953001273609, "learning_rate": 5.732735898787559e-09, "logits/chosen": -0.7318969964981079, "logits/rejected": -0.8912109136581421, "logps/chosen": -402.0, "logps/rejected": -487.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.1656250953674316, "rewards/margins": 10.324999809265137, "rewards/rejected": -13.487500190734863, "step": 15090 }, { "epoch": 3.9799683711122826, "grad_norm": 0.18538684042054357, "learning_rate": 5.07380073800738e-09, "logits/chosen": -0.42083740234375, "logits/rejected": -0.862353503704071, "logps/chosen": -384.8999938964844, "logps/rejected": -491.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.201367139816284, "rewards/margins": 10.762499809265137, "rewards/rejected": -13.946874618530273, "step": 15100 }, { "epoch": 3.9826041117554034, "grad_norm": 6.169713649341421, "learning_rate": 4.4148655772272e-09, "logits/chosen": -0.6341552734375, "logits/rejected": -0.8174804449081421, "logps/chosen": -415.6000061035156, "logps/rejected": -487.29998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.900195360183716, "rewards/margins": 10.701562881469727, "rewards/rejected": -13.600000381469727, "step": 15110 }, { "epoch": 3.985239852398524, "grad_norm": 2.3683112206847405, "learning_rate": 3.7559304164470215e-09, "logits/chosen": -0.7322174310684204, "logits/rejected": -1.055761694908142, "logps/chosen": -403.8500061035156, "logps/rejected": -464.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.3828125, "rewards/margins": 10.506250381469727, "rewards/rejected": -13.893750190734863, "step": 15120 }, { "epoch": 3.987875593041645, "grad_norm": 1.0355901140119081, "learning_rate": 3.096995255666842e-09, "logits/chosen": -0.426025390625, "logits/rejected": -0.913378894329071, "logps/chosen": -403.6000061035156, "logps/rejected": -458.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.9068360328674316, "rewards/margins": 11.090624809265137, "rewards/rejected": -13.996874809265137, "step": 15130 }, { "epoch": 3.9905113336847653, "grad_norm": 2.0950128260883165, "learning_rate": 2.4380600948866633e-09, "logits/chosen": -0.5411132574081421, "logits/rejected": -0.8126465082168579, "logps/chosen": -398.29998779296875, "logps/rejected": -445.29998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.9886717796325684, "rewards/margins": 9.926562309265137, "rewards/rejected": -12.931249618530273, "step": 15140 }, { "epoch": 3.993147074327886, "grad_norm": 0.35721359132654984, "learning_rate": 1.779124934106484e-09, "logits/chosen": -0.6378539800643921, "logits/rejected": -0.869921863079071, "logps/chosen": -442.20001220703125, "logps/rejected": -531.2999877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.3082032203674316, "rewards/margins": 10.625, "rewards/rejected": -13.928125381469727, "step": 15150 }, { "epoch": 3.995782814971007, "grad_norm": 0.5931348396902296, "learning_rate": 1.1201897733263046e-09, "logits/chosen": -0.49028319120407104, "logits/rejected": -0.8805176019668579, "logps/chosen": -372.95001220703125, "logps/rejected": -470.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.9183592796325684, "rewards/margins": 10.643750190734863, "rewards/rejected": -13.565625190734863, "step": 15160 }, { "epoch": 3.9984185556141276, "grad_norm": 0.39088030333665, "learning_rate": 4.612546125461254e-10, "logits/chosen": -0.616528332233429, "logits/rejected": -0.92529296875, "logps/chosen": -400.8500061035156, "logps/rejected": -468.70001220703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.912109375, "rewards/margins": 10.5078125, "rewards/rejected": -13.412500381469727, "step": 15170 } ], "logging_steps": 10, "max_steps": 15176, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }