{ "best_global_step": 27000, "best_metric": 0.024163657799363136, "best_model_checkpoint": "trainer_output/checkpoint-27000", "epoch": 3.0, "eval_steps": 1000, "global_step": 34020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6916703507304192, "epoch": 8.818536563857227e-05, "grad_norm": 15.375, "learning_rate": 0.0, "loss": 0.5625149607658386, "mean_token_accuracy": 0.8633110001683235, "num_tokens": 1333.0, "step": 1 }, { "entropy": 0.5527553335608294, "epoch": 0.002204634140964307, "grad_norm": 4.09375, "learning_rate": 4.701273261508325e-06, "loss": 0.4002639452616374, "mean_token_accuracy": 0.8936887547994653, "num_tokens": 26348.0, "step": 25 }, { "entropy": 0.15097233657725156, "epoch": 0.004409268281928614, "grad_norm": 3.046875, "learning_rate": 9.598432908912831e-06, "loss": 0.1661180877685547, "mean_token_accuracy": 0.9615221408009529, "num_tokens": 52419.0, "step": 50 }, { "entropy": 0.12569273573113604, "epoch": 0.006613902422892921, "grad_norm": 3.140625, "learning_rate": 1.4495592556317337e-05, "loss": 0.15316170692443848, "mean_token_accuracy": 0.9623415350914002, "num_tokens": 79151.0, "step": 75 }, { "entropy": 0.10584518638905138, "epoch": 0.008818536563857228, "grad_norm": 2.78125, "learning_rate": 1.9392752203721843e-05, "loss": 0.1182499122619629, "mean_token_accuracy": 0.972541985809803, "num_tokens": 105057.0, "step": 100 }, { "entropy": 0.08121318614110351, "epoch": 0.011023170704821535, "grad_norm": 4.25, "learning_rate": 2.428991185112635e-05, "loss": 0.10249818801879883, "mean_token_accuracy": 0.9750828278064728, "num_tokens": 131925.0, "step": 125 }, { "entropy": 0.0871189395734109, "epoch": 0.013227804845785842, "grad_norm": 1.8828125, "learning_rate": 2.9187071498530854e-05, "loss": 0.11519923210144042, "mean_token_accuracy": 0.9748159709572792, "num_tokens": 158543.0, "step": 150 }, { "entropy": 0.0863258442163351, "epoch": 0.015432438986750148, "grad_norm": 2.703125, "learning_rate": 3.408423114593536e-05, "loss": 0.11372153282165527, "mean_token_accuracy": 0.9735086694359779, "num_tokens": 185289.0, "step": 175 }, { "entropy": 0.06604936945135705, "epoch": 0.017637073127714457, "grad_norm": 2.359375, "learning_rate": 3.8981390793339866e-05, "loss": 0.08210546493530274, "mean_token_accuracy": 0.9802239489555359, "num_tokens": 210755.0, "step": 200 }, { "entropy": 0.08488065572455525, "epoch": 0.019841707268678763, "grad_norm": 1.0, "learning_rate": 4.387855044074437e-05, "loss": 0.09681215286254882, "mean_token_accuracy": 0.9729325622320175, "num_tokens": 236918.0, "step": 225 }, { "entropy": 0.08888421370531432, "epoch": 0.02204634140964307, "grad_norm": 2.5, "learning_rate": 4.877571008814887e-05, "loss": 0.12263742446899414, "mean_token_accuracy": 0.9723492181301117, "num_tokens": 263894.0, "step": 250 }, { "entropy": 0.081377989416942, "epoch": 0.024250975550607377, "grad_norm": 2.15625, "learning_rate": 5.367286973555338e-05, "loss": 0.10161479949951172, "mean_token_accuracy": 0.9720952340960503, "num_tokens": 289789.0, "step": 275 }, { "entropy": 0.0684462244680617, "epoch": 0.026455609691571683, "grad_norm": 1.96875, "learning_rate": 5.857002938295789e-05, "loss": 0.08587800979614257, "mean_token_accuracy": 0.9774800541996956, "num_tokens": 315300.0, "step": 300 }, { "entropy": 0.08871725924720522, "epoch": 0.02866024383253599, "grad_norm": 1.1640625, "learning_rate": 6.346718903036239e-05, "loss": 0.10993636131286622, "mean_token_accuracy": 0.9720117238163948, "num_tokens": 341328.0, "step": 325 }, { "entropy": 0.06815292302024317, "epoch": 0.030864877973500297, "grad_norm": 1.7890625, "learning_rate": 6.83643486777669e-05, "loss": 0.09177898406982422, "mean_token_accuracy": 0.9783439686894417, "num_tokens": 367118.0, "step": 350 }, { "entropy": 0.08387046866118908, "epoch": 0.03306951211446461, "grad_norm": 0.419921875, "learning_rate": 7.32615083251714e-05, "loss": 0.11903386116027832, "mean_token_accuracy": 0.9728097701072693, "num_tokens": 393500.0, "step": 375 }, { "entropy": 0.08728675234102411, "epoch": 0.03527414625542891, "grad_norm": 1.859375, "learning_rate": 7.81586679725759e-05, "loss": 0.12638781547546388, "mean_token_accuracy": 0.9733547866344452, "num_tokens": 419948.0, "step": 400 }, { "entropy": 0.09208631370478543, "epoch": 0.03747878039639322, "grad_norm": 2.484375, "learning_rate": 8.305582761998043e-05, "loss": 0.13350732803344725, "mean_token_accuracy": 0.9699632921814918, "num_tokens": 446845.0, "step": 425 }, { "entropy": 0.10303217243927065, "epoch": 0.03968341453735753, "grad_norm": 3.65625, "learning_rate": 8.795298726738492e-05, "loss": 0.15950148582458495, "mean_token_accuracy": 0.966689610183239, "num_tokens": 474184.0, "step": 450 }, { "entropy": 0.10971505879715551, "epoch": 0.04188804867832183, "grad_norm": 1.90625, "learning_rate": 9.285014691478942e-05, "loss": 0.14567254066467286, "mean_token_accuracy": 0.9634886494278908, "num_tokens": 502243.0, "step": 475 }, { "entropy": 0.1019335692783352, "epoch": 0.04409268281928614, "grad_norm": 1.71875, "learning_rate": 9.774730656219394e-05, "loss": 0.12678200721740723, "mean_token_accuracy": 0.9698567974567414, "num_tokens": 528691.0, "step": 500 }, { "entropy": 0.11618134506978095, "epoch": 0.04629731696025045, "grad_norm": 2.484375, "learning_rate": 0.00010264446620959845, "loss": 0.1609012222290039, "mean_token_accuracy": 0.9640674701333046, "num_tokens": 557262.0, "step": 525 }, { "entropy": 0.08170637517177966, "epoch": 0.04850195110121475, "grad_norm": 3.671875, "learning_rate": 0.00010754162585700293, "loss": 0.11303130149841309, "mean_token_accuracy": 0.9746711283922196, "num_tokens": 583864.0, "step": 550 }, { "entropy": 0.08663091063848696, "epoch": 0.05070658524217906, "grad_norm": 1.7578125, "learning_rate": 0.00011243878550440745, "loss": 0.11758679389953614, "mean_token_accuracy": 0.9716430401802063, "num_tokens": 609742.0, "step": 575 }, { "entropy": 0.09322622302162926, "epoch": 0.05291121938314337, "grad_norm": 3.5, "learning_rate": 0.00011733594515181196, "loss": 0.14786581039428712, "mean_token_accuracy": 0.9670667290687561, "num_tokens": 635773.0, "step": 600 }, { "entropy": 0.12020757110440172, "epoch": 0.05511585352410767, "grad_norm": 2.46875, "learning_rate": 0.00012223310479921644, "loss": 0.1722104072570801, "mean_token_accuracy": 0.963900217115879, "num_tokens": 662038.0, "step": 625 }, { "entropy": 0.11677032057195902, "epoch": 0.05732048766507198, "grad_norm": 15.625, "learning_rate": 0.00012713026444662097, "loss": 0.14971059799194336, "mean_token_accuracy": 0.9657747402787209, "num_tokens": 687244.0, "step": 650 }, { "entropy": 0.11523487624712288, "epoch": 0.05952512180603629, "grad_norm": 1.9375, "learning_rate": 0.00013202742409402547, "loss": 0.1585102081298828, "mean_token_accuracy": 0.9673979789018631, "num_tokens": 714382.0, "step": 675 }, { "entropy": 0.11896388848195784, "epoch": 0.06172975594700059, "grad_norm": 3.5, "learning_rate": 0.00013692458374142997, "loss": 0.1676181411743164, "mean_token_accuracy": 0.9640780803561211, "num_tokens": 740592.0, "step": 700 }, { "entropy": 0.09957129561458715, "epoch": 0.0639343900879649, "grad_norm": 4.15625, "learning_rate": 0.0001418217433888345, "loss": 0.14969416618347167, "mean_token_accuracy": 0.9703717887401581, "num_tokens": 766145.0, "step": 725 }, { "entropy": 0.11554073191975477, "epoch": 0.06613902422892921, "grad_norm": 3.453125, "learning_rate": 0.000146718903036239, "loss": 0.15943048477172853, "mean_token_accuracy": 0.9683528250455856, "num_tokens": 792413.0, "step": 750 }, { "entropy": 0.11423591443977785, "epoch": 0.06834365836989352, "grad_norm": 9.25, "learning_rate": 0.0001516160626836435, "loss": 0.16937288284301757, "mean_token_accuracy": 0.962159241437912, "num_tokens": 818187.0, "step": 775 }, { "entropy": 0.15337594370008445, "epoch": 0.07054829251085783, "grad_norm": 2.53125, "learning_rate": 0.000156513222331048, "loss": 0.19895185470581056, "mean_token_accuracy": 0.9605051165819168, "num_tokens": 845825.0, "step": 800 }, { "entropy": 0.10168401733972132, "epoch": 0.07275292665182213, "grad_norm": 4.34375, "learning_rate": 0.00016141038197845252, "loss": 0.14234644889831544, "mean_token_accuracy": 0.9697119271755219, "num_tokens": 870399.0, "step": 825 }, { "entropy": 0.13999531235196627, "epoch": 0.07495756079278644, "grad_norm": 3.109375, "learning_rate": 0.00016630754162585702, "loss": 0.19501604080200197, "mean_token_accuracy": 0.959660935997963, "num_tokens": 897210.0, "step": 850 }, { "entropy": 0.1450615050335182, "epoch": 0.07716219493375075, "grad_norm": 7.125, "learning_rate": 0.00017120470127326152, "loss": 0.19098827362060547, "mean_token_accuracy": 0.9595730248093605, "num_tokens": 923017.0, "step": 875 }, { "entropy": 0.1744435393344611, "epoch": 0.07936682907471505, "grad_norm": 4.6875, "learning_rate": 0.00017610186092066602, "loss": 0.22231653213500976, "mean_token_accuracy": 0.947464411854744, "num_tokens": 950476.0, "step": 900 }, { "entropy": 0.13871764447540044, "epoch": 0.08157146321567936, "grad_norm": 2.75, "learning_rate": 0.00018099902056807051, "loss": 0.2051361083984375, "mean_token_accuracy": 0.9618028599023819, "num_tokens": 976674.0, "step": 925 }, { "entropy": 0.11772402749746107, "epoch": 0.08377609735664367, "grad_norm": 2.578125, "learning_rate": 0.00018589618021547504, "loss": 0.1542353343963623, "mean_token_accuracy": 0.9612125977873802, "num_tokens": 1002875.0, "step": 950 }, { "entropy": 0.14306295619462617, "epoch": 0.08598073149760797, "grad_norm": 4.75, "learning_rate": 0.00019079333986287954, "loss": 0.2011384391784668, "mean_token_accuracy": 0.9559961825609207, "num_tokens": 1028393.0, "step": 975 }, { "entropy": 0.1746880966401659, "epoch": 0.08818536563857228, "grad_norm": 4.71875, "learning_rate": 0.00019569049951028404, "loss": 0.24061763763427735, "mean_token_accuracy": 0.9488253518939018, "num_tokens": 1053692.0, "step": 1000 }, { "epoch": 0.08818536563857228, "eval_entropy": 0.09399272183322045, "eval_loss": 0.09840720146894455, "eval_mean_token_accuracy": 0.9736589030234762, "eval_num_tokens": 1053692.0, "eval_runtime": 245.793, "eval_samples_per_second": 15.985, "eval_steps_per_second": 3.999, "step": 1000 }, { "entropy": 0.13214532324811443, "epoch": 0.09038999977953659, "grad_norm": 4.75, "learning_rate": 0.00019999999592140393, "loss": 0.17850215911865233, "mean_token_accuracy": 0.9571027851104736, "num_tokens": 1079429.0, "step": 1025 }, { "entropy": 0.16548033393686637, "epoch": 0.0925946339205009, "grad_norm": 4.65625, "learning_rate": 0.0001999996447091715, "loss": 0.22275726318359376, "mean_token_accuracy": 0.9530195724964142, "num_tokens": 1106553.0, "step": 1050 }, { "entropy": 0.14543476202059538, "epoch": 0.0947992680614652, "grad_norm": 5.21875, "learning_rate": 0.00019999872702753714, "loss": 0.180413875579834, "mean_token_accuracy": 0.9578671759366989, "num_tokens": 1132889.0, "step": 1075 }, { "entropy": 0.13837194723077118, "epoch": 0.0970039022024295, "grad_norm": 10.0625, "learning_rate": 0.00019999724288169928, "loss": 0.1911659622192383, "mean_token_accuracy": 0.9591153275966644, "num_tokens": 1159859.0, "step": 1100 }, { "entropy": 0.15544078743841966, "epoch": 0.09920853634339381, "grad_norm": 2.578125, "learning_rate": 0.00019999519228006515, "loss": 0.2121676254272461, "mean_token_accuracy": 0.9535513919591904, "num_tokens": 1187029.0, "step": 1125 }, { "entropy": 0.15761214319965802, "epoch": 0.10141317048435812, "grad_norm": 3.609375, "learning_rate": 0.00019999257523425088, "loss": 0.23544374465942383, "mean_token_accuracy": 0.9549354958534241, "num_tokens": 1212489.0, "step": 1150 }, { "entropy": 0.14129603882553055, "epoch": 0.10361780462532243, "grad_norm": 7.53125, "learning_rate": 0.00019998939175908126, "loss": 0.18428075790405274, "mean_token_accuracy": 0.9605149912834168, "num_tokens": 1239902.0, "step": 1175 }, { "entropy": 0.19539385408628732, "epoch": 0.10582243876628673, "grad_norm": 7.21875, "learning_rate": 0.00019998564187258974, "loss": 0.30669073104858396, "mean_token_accuracy": 0.9401221024990082, "num_tokens": 1267180.0, "step": 1200 }, { "entropy": 0.17899971085134894, "epoch": 0.10802707290725104, "grad_norm": 90.0, "learning_rate": 0.0001999813255960184, "loss": 0.2703672981262207, "mean_token_accuracy": 0.9464505657553672, "num_tokens": 1295006.0, "step": 1225 }, { "entropy": 0.15225840135361068, "epoch": 0.11023170704821535, "grad_norm": 5.09375, "learning_rate": 0.00019997644295381765, "loss": 0.18838932037353515, "mean_token_accuracy": 0.9539206418395042, "num_tokens": 1321227.0, "step": 1250 }, { "entropy": 0.16457684379653073, "epoch": 0.11243634118917965, "grad_norm": 17.875, "learning_rate": 0.0001999709939736463, "loss": 0.22236486434936523, "mean_token_accuracy": 0.9525096288323402, "num_tokens": 1348546.0, "step": 1275 }, { "entropy": 0.1539011792768724, "epoch": 0.11464097533014396, "grad_norm": 3.828125, "learning_rate": 0.00019996497868637132, "loss": 0.20238460540771486, "mean_token_accuracy": 0.9542306599020958, "num_tokens": 1374669.0, "step": 1300 }, { "entropy": 0.20363085102522746, "epoch": 0.11684560947110827, "grad_norm": 4.65625, "learning_rate": 0.0001999583971260675, "loss": 0.28179990768432617, "mean_token_accuracy": 0.941902932524681, "num_tokens": 1400529.0, "step": 1325 }, { "entropy": 0.1503680677805096, "epoch": 0.11905024361207257, "grad_norm": 15.3125, "learning_rate": 0.00019995124933001754, "loss": 0.22546863555908203, "mean_token_accuracy": 0.9548282194137573, "num_tokens": 1428330.0, "step": 1350 }, { "entropy": 0.13919373288343195, "epoch": 0.12125487775303688, "grad_norm": 5.15625, "learning_rate": 0.00019994353533871165, "loss": 0.1785750961303711, "mean_token_accuracy": 0.9592090272903442, "num_tokens": 1454023.0, "step": 1375 }, { "entropy": 0.1594487912580371, "epoch": 0.12345951189400119, "grad_norm": 3.1875, "learning_rate": 0.0001999352551958474, "loss": 0.22408544540405273, "mean_token_accuracy": 0.9546812137961388, "num_tokens": 1480660.0, "step": 1400 }, { "entropy": 0.12862279892200604, "epoch": 0.1256641460349655, "grad_norm": 4.65625, "learning_rate": 0.00019992640894832945, "loss": 0.1851462936401367, "mean_token_accuracy": 0.9622556710243225, "num_tokens": 1507742.0, "step": 1425 }, { "entropy": 0.12973011572903487, "epoch": 0.1278687801759298, "grad_norm": 13.4375, "learning_rate": 0.00019991699664626924, "loss": 0.182869815826416, "mean_token_accuracy": 0.9545741581916809, "num_tokens": 1534303.0, "step": 1450 }, { "entropy": 0.1497259267186746, "epoch": 0.13007341431689412, "grad_norm": 8.8125, "learning_rate": 0.00019990701834298475, "loss": 0.22879138946533203, "mean_token_accuracy": 0.9533333089947701, "num_tokens": 1560598.0, "step": 1475 }, { "entropy": 0.18277295206673444, "epoch": 0.13227804845785843, "grad_norm": 7.90625, "learning_rate": 0.00019989647409500024, "loss": 0.21807186126708986, "mean_token_accuracy": 0.9539013829827309, "num_tokens": 1587377.0, "step": 1500 }, { "entropy": 0.1608945361687802, "epoch": 0.13448268259882273, "grad_norm": 6.90625, "learning_rate": 0.00019988536396204585, "loss": 0.2067807388305664, "mean_token_accuracy": 0.9522431689500809, "num_tokens": 1614095.0, "step": 1525 }, { "entropy": 0.16176950700290035, "epoch": 0.13668731673978704, "grad_norm": 8.6875, "learning_rate": 0.00019987368800705732, "loss": 0.21591096878051758, "mean_token_accuracy": 0.9503604331612587, "num_tokens": 1639568.0, "step": 1550 }, { "entropy": 0.19014478008728475, "epoch": 0.13889195088075135, "grad_norm": 20.625, "learning_rate": 0.0001998614462961756, "loss": 0.2421278953552246, "mean_token_accuracy": 0.9487079519033432, "num_tokens": 1664964.0, "step": 1575 }, { "entropy": 0.17077196488622576, "epoch": 0.14109658502171565, "grad_norm": 7.84375, "learning_rate": 0.00019984863889874646, "loss": 0.2088161277770996, "mean_token_accuracy": 0.9499969807267189, "num_tokens": 1691979.0, "step": 1600 }, { "entropy": 0.1300421412358992, "epoch": 0.14330121916267996, "grad_norm": 10.875, "learning_rate": 0.00019983526588732015, "loss": 0.18558111190795898, "mean_token_accuracy": 0.9627196085453034, "num_tokens": 1717606.0, "step": 1625 }, { "entropy": 0.15682672704104333, "epoch": 0.14550585330364427, "grad_norm": 13.375, "learning_rate": 0.000199821327337651, "loss": 0.199498291015625, "mean_token_accuracy": 0.9547498086094857, "num_tokens": 1743479.0, "step": 1650 }, { "entropy": 0.13432513456908055, "epoch": 0.14771048744460857, "grad_norm": 6.65625, "learning_rate": 0.00019980682332869685, "loss": 0.17353324890136718, "mean_token_accuracy": 0.959072844684124, "num_tokens": 1769361.0, "step": 1675 }, { "entropy": 0.16332185944309457, "epoch": 0.14991512158557288, "grad_norm": 8.75, "learning_rate": 0.0001997917539426188, "loss": 0.2064606285095215, "mean_token_accuracy": 0.9575307658314705, "num_tokens": 1796448.0, "step": 1700 }, { "entropy": 0.1739040635782294, "epoch": 0.1521197557265372, "grad_norm": 5.90625, "learning_rate": 0.00019977611926478062, "loss": 0.21098520278930663, "mean_token_accuracy": 0.954549497961998, "num_tokens": 1822567.0, "step": 1725 }, { "entropy": 0.159052537526004, "epoch": 0.1543243898675015, "grad_norm": 21.125, "learning_rate": 0.00019975991938374826, "loss": 0.2069353485107422, "mean_token_accuracy": 0.9509221604466438, "num_tokens": 1850030.0, "step": 1750 }, { "entropy": 0.15716556440107524, "epoch": 0.1565290240084658, "grad_norm": 8.375, "learning_rate": 0.00019974315439128944, "loss": 0.19329355239868165, "mean_token_accuracy": 0.9546804228425025, "num_tokens": 1876869.0, "step": 1775 }, { "entropy": 0.1525935530057177, "epoch": 0.1587336581494301, "grad_norm": 6.09375, "learning_rate": 0.000199725824382373, "loss": 0.20368234634399415, "mean_token_accuracy": 0.9573848494887351, "num_tokens": 1902705.0, "step": 1800 }, { "entropy": 0.14915346608497201, "epoch": 0.1609382922903944, "grad_norm": 12.3125, "learning_rate": 0.0001997079294551686, "loss": 0.20571292877197267, "mean_token_accuracy": 0.9549185973405838, "num_tokens": 1929332.0, "step": 1825 }, { "entropy": 0.1540890626120381, "epoch": 0.16314292643135872, "grad_norm": 7.96875, "learning_rate": 0.00019968946971104577, "loss": 0.2072519302368164, "mean_token_accuracy": 0.9564824342727661, "num_tokens": 1956884.0, "step": 1850 }, { "entropy": 0.14470923049142584, "epoch": 0.16534756057232303, "grad_norm": 7.125, "learning_rate": 0.00019967044525457373, "loss": 0.17775110244750977, "mean_token_accuracy": 0.9551727205514908, "num_tokens": 1982453.0, "step": 1875 }, { "entropy": 0.15138954945839941, "epoch": 0.16755219471328733, "grad_norm": 9.375, "learning_rate": 0.0001996508561935206, "loss": 0.18922266006469726, "mean_token_accuracy": 0.9547863042354584, "num_tokens": 2009831.0, "step": 1900 }, { "entropy": 0.14155943776480853, "epoch": 0.16975682885425164, "grad_norm": 5.25, "learning_rate": 0.0001996307026388528, "loss": 0.20168161392211914, "mean_token_accuracy": 0.9573139929771424, "num_tokens": 2035463.0, "step": 1925 }, { "entropy": 0.13731083187274634, "epoch": 0.17196146299521595, "grad_norm": 5.59375, "learning_rate": 0.00019960998470473445, "loss": 0.20148942947387696, "mean_token_accuracy": 0.9581686609983444, "num_tokens": 2061128.0, "step": 1950 }, { "entropy": 0.18538200524868442, "epoch": 0.17416609713618025, "grad_norm": 6.1875, "learning_rate": 0.0001995887025085267, "loss": 0.2421389961242676, "mean_token_accuracy": 0.9522135049104691, "num_tokens": 2088131.0, "step": 1975 }, { "entropy": 0.1542595046805218, "epoch": 0.17637073127714456, "grad_norm": 13.4375, "learning_rate": 0.00019956685617078716, "loss": 0.17982059478759765, "mean_token_accuracy": 0.9567183205485343, "num_tokens": 2113958.0, "step": 2000 }, { "epoch": 0.17637073127714456, "eval_entropy": 0.12889170395125116, "eval_loss": 0.14472714066505432, "eval_mean_token_accuracy": 0.9669632775252373, "eval_num_tokens": 2113958.0, "eval_runtime": 230.2268, "eval_samples_per_second": 17.066, "eval_steps_per_second": 4.27, "step": 2000 }, { "entropy": 0.17109735576668755, "epoch": 0.17857536541810887, "grad_norm": 15.625, "learning_rate": 0.00019954444581526907, "loss": 0.18888046264648437, "mean_token_accuracy": 0.960274083018303, "num_tokens": 2139315.0, "step": 2025 }, { "entropy": 0.2150292192818597, "epoch": 0.18077999955907317, "grad_norm": 3.125, "learning_rate": 0.0001995214715689207, "loss": 0.24926740646362305, "mean_token_accuracy": 0.9541423147916794, "num_tokens": 2165555.0, "step": 2050 }, { "entropy": 0.14501372674596497, "epoch": 0.18298463370003748, "grad_norm": 4.90625, "learning_rate": 0.00019949793356188454, "loss": 0.20213737487792968, "mean_token_accuracy": 0.9615710428357125, "num_tokens": 2191462.0, "step": 2075 }, { "entropy": 0.1310080810985528, "epoch": 0.1851892678410018, "grad_norm": 4.8125, "learning_rate": 0.00019947383192749668, "loss": 0.18873476028442382, "mean_token_accuracy": 0.9583966609835625, "num_tokens": 2216189.0, "step": 2100 }, { "entropy": 0.1508674483350478, "epoch": 0.1873939019819661, "grad_norm": 13.1875, "learning_rate": 0.00019944916680228608, "loss": 0.18426620483398437, "mean_token_accuracy": 0.956863840520382, "num_tokens": 2241622.0, "step": 2125 }, { "entropy": 0.19120873935054988, "epoch": 0.1895985361229304, "grad_norm": 9.125, "learning_rate": 0.0001994239383259735, "loss": 0.2455206298828125, "mean_token_accuracy": 0.9497691139578819, "num_tokens": 2268247.0, "step": 2150 }, { "entropy": 0.13146985903033057, "epoch": 0.1918031702638947, "grad_norm": 16.375, "learning_rate": 0.00019939814664147112, "loss": 0.19511665344238283, "mean_token_accuracy": 0.9607363530993461, "num_tokens": 2293502.0, "step": 2175 }, { "entropy": 0.1565953103499487, "epoch": 0.194007804404859, "grad_norm": 9.8125, "learning_rate": 0.00019937179189488146, "loss": 0.21843402862548827, "mean_token_accuracy": 0.9554175850749016, "num_tokens": 2318791.0, "step": 2200 }, { "entropy": 0.19252748679718934, "epoch": 0.19621243854582332, "grad_norm": 6.40625, "learning_rate": 0.00019934487423549656, "loss": 0.2356930160522461, "mean_token_accuracy": 0.9517645919322968, "num_tokens": 2345980.0, "step": 2225 }, { "entropy": 0.14177053164923564, "epoch": 0.19841707268678763, "grad_norm": 9.5, "learning_rate": 0.00019931739381579737, "loss": 0.20377872467041017, "mean_token_accuracy": 0.9597736677527428, "num_tokens": 2371597.0, "step": 2250 }, { "entropy": 0.1750909099751152, "epoch": 0.20062170682775193, "grad_norm": 2.84375, "learning_rate": 0.00019928935079145254, "loss": 0.19352615356445313, "mean_token_accuracy": 0.9594111356139183, "num_tokens": 2397749.0, "step": 2275 }, { "entropy": 0.1871402624528855, "epoch": 0.20282634096871624, "grad_norm": 9.4375, "learning_rate": 0.00019926074532131778, "loss": 0.23113348007202147, "mean_token_accuracy": 0.9543161234259605, "num_tokens": 2423692.0, "step": 2300 }, { "entropy": 0.15049185859272257, "epoch": 0.20503097510968055, "grad_norm": 4.5, "learning_rate": 0.00019923157756743492, "loss": 0.18978424072265626, "mean_token_accuracy": 0.9580981060862541, "num_tokens": 2449112.0, "step": 2325 }, { "entropy": 0.16552068044547924, "epoch": 0.20723560925064485, "grad_norm": 4.8125, "learning_rate": 0.00019920184769503096, "loss": 0.23359138488769532, "mean_token_accuracy": 0.9517715626955032, "num_tokens": 2475135.0, "step": 2350 }, { "entropy": 0.17032419282593764, "epoch": 0.20944024339160916, "grad_norm": 13.5625, "learning_rate": 0.00019917155587251712, "loss": 0.20869092941284179, "mean_token_accuracy": 0.9506035950779915, "num_tokens": 2502512.0, "step": 2375 }, { "entropy": 0.15413983556907623, "epoch": 0.21164487753257347, "grad_norm": 8.6875, "learning_rate": 0.00019914070227148795, "loss": 0.20199440002441407, "mean_token_accuracy": 0.9544082489609719, "num_tokens": 2528612.0, "step": 2400 }, { "entropy": 0.20244524533161892, "epoch": 0.21384951167353777, "grad_norm": 7.34375, "learning_rate": 0.00019910928706672022, "loss": 0.26269786834716796, "mean_token_accuracy": 0.9423273745179176, "num_tokens": 2555625.0, "step": 2425 }, { "entropy": 0.16426463149138726, "epoch": 0.21605414581450208, "grad_norm": 3.90625, "learning_rate": 0.0001990773104361721, "loss": 0.2108094596862793, "mean_token_accuracy": 0.953407633304596, "num_tokens": 2581347.0, "step": 2450 }, { "entropy": 0.18900560716865583, "epoch": 0.2182587799554664, "grad_norm": 5.96875, "learning_rate": 0.0001990447725609821, "loss": 0.23020565032958984, "mean_token_accuracy": 0.9502732402086258, "num_tokens": 2607478.0, "step": 2475 }, { "entropy": 0.14198036040528678, "epoch": 0.2204634140964307, "grad_norm": 6.75, "learning_rate": 0.0001990116736254679, "loss": 0.17339960098266602, "mean_token_accuracy": 0.9606603449583053, "num_tokens": 2634683.0, "step": 2500 }, { "entropy": 0.16489507200894876, "epoch": 0.222668048237395, "grad_norm": 8.75, "learning_rate": 0.00019897801381712563, "loss": 0.22926589965820313, "mean_token_accuracy": 0.9573606929183006, "num_tokens": 2662560.0, "step": 2525 }, { "entropy": 0.14484784842003137, "epoch": 0.2248726823783593, "grad_norm": 8.125, "learning_rate": 0.00019894379332662836, "loss": 0.20587156295776368, "mean_token_accuracy": 0.9606982052326203, "num_tokens": 2689760.0, "step": 2550 }, { "entropy": 0.1534224282670766, "epoch": 0.2270773165193236, "grad_norm": 6.75, "learning_rate": 0.0001989090123478255, "loss": 0.21122034072875975, "mean_token_accuracy": 0.9542035666108132, "num_tokens": 2715288.0, "step": 2575 }, { "entropy": 0.16028283250052483, "epoch": 0.22928195066028792, "grad_norm": 4.625, "learning_rate": 0.00019887367107774125, "loss": 0.20950069427490234, "mean_token_accuracy": 0.9586973160505294, "num_tokens": 2740557.0, "step": 2600 }, { "entropy": 0.18298680773936213, "epoch": 0.23148658480125223, "grad_norm": 8.3125, "learning_rate": 0.00019883776971657384, "loss": 0.2443247604370117, "mean_token_accuracy": 0.9503592163324356, "num_tokens": 2767449.0, "step": 2625 }, { "entropy": 0.1627288061589934, "epoch": 0.23369121894221653, "grad_norm": 12.375, "learning_rate": 0.00019880130846769425, "loss": 0.20207910537719725, "mean_token_accuracy": 0.953170590698719, "num_tokens": 2792767.0, "step": 2650 }, { "entropy": 0.15129281114554033, "epoch": 0.23589585308318084, "grad_norm": 10.8125, "learning_rate": 0.000198764287537645, "loss": 0.19894697189331054, "mean_token_accuracy": 0.9556801280379296, "num_tokens": 2819173.0, "step": 2675 }, { "entropy": 0.13785832320805638, "epoch": 0.23810048722414515, "grad_norm": 8.3125, "learning_rate": 0.00019872670713613907, "loss": 0.16127649307250977, "mean_token_accuracy": 0.9632558959722519, "num_tokens": 2844158.0, "step": 2700 }, { "entropy": 0.17196971918223425, "epoch": 0.24030512136510945, "grad_norm": 13.875, "learning_rate": 0.00019868856747605872, "loss": 0.22303541183471678, "mean_token_accuracy": 0.9536935070157051, "num_tokens": 2870212.0, "step": 2725 }, { "entropy": 0.14516899693524465, "epoch": 0.24250975550607376, "grad_norm": 10.5, "learning_rate": 0.0001986498687734542, "loss": 0.18350645065307616, "mean_token_accuracy": 0.9615237146615982, "num_tokens": 2896169.0, "step": 2750 }, { "entropy": 0.19394321402534842, "epoch": 0.24471438964703807, "grad_norm": 4.71875, "learning_rate": 0.00019861061124754262, "loss": 0.23362028121948242, "mean_token_accuracy": 0.9451329198479652, "num_tokens": 2924429.0, "step": 2775 }, { "entropy": 0.1593242084258236, "epoch": 0.24691902378800237, "grad_norm": 36.0, "learning_rate": 0.00019857079512070663, "loss": 0.2111642074584961, "mean_token_accuracy": 0.9549560195207596, "num_tokens": 2949725.0, "step": 2800 }, { "entropy": 0.14827293824637308, "epoch": 0.24912365792896668, "grad_norm": 8.75, "learning_rate": 0.00019853042061849317, "loss": 0.18694892883300782, "mean_token_accuracy": 0.9584823083877564, "num_tokens": 2975320.0, "step": 2825 }, { "entropy": 0.14805610009469092, "epoch": 0.251328292069931, "grad_norm": 4.8125, "learning_rate": 0.00019848948796961233, "loss": 0.1870688819885254, "mean_token_accuracy": 0.9585736668109894, "num_tokens": 3000789.0, "step": 2850 }, { "entropy": 0.1488057920546271, "epoch": 0.2535329262108953, "grad_norm": 2.78125, "learning_rate": 0.00019844799740593582, "loss": 0.2043631935119629, "mean_token_accuracy": 0.9576079681515693, "num_tokens": 3027263.0, "step": 2875 }, { "entropy": 0.17447588493814692, "epoch": 0.2557375603518596, "grad_norm": 5.09375, "learning_rate": 0.0001984059491624958, "loss": 0.22025297164916993, "mean_token_accuracy": 0.9506224057078362, "num_tokens": 3053711.0, "step": 2900 }, { "entropy": 0.1623274303250946, "epoch": 0.2579421944928239, "grad_norm": 3.96875, "learning_rate": 0.00019836334347748358, "loss": 0.19379936218261717, "mean_token_accuracy": 0.9566253125667572, "num_tokens": 3080007.0, "step": 2925 }, { "entropy": 0.16122146823909134, "epoch": 0.26014682863378824, "grad_norm": 9.375, "learning_rate": 0.0001983201805922482, "loss": 0.22965421676635742, "mean_token_accuracy": 0.9526374578475952, "num_tokens": 3107320.0, "step": 2950 }, { "entropy": 0.20952579901786522, "epoch": 0.2623514627747525, "grad_norm": 10.875, "learning_rate": 0.00019827646075129502, "loss": 0.26936729431152345, "mean_token_accuracy": 0.9549751174449921, "num_tokens": 3133692.0, "step": 2975 }, { "entropy": 0.145962798174005, "epoch": 0.26455609691571685, "grad_norm": 9.9375, "learning_rate": 0.0001982321842022845, "loss": 0.19020818710327148, "mean_token_accuracy": 0.9561607944965362, "num_tokens": 3160508.0, "step": 3000 }, { "epoch": 0.26455609691571685, "eval_entropy": 0.11402115050896562, "eval_loss": 0.09617123752832413, "eval_mean_token_accuracy": 0.9735903109320546, "eval_num_tokens": 3160508.0, "eval_runtime": 229.2109, "eval_samples_per_second": 17.141, "eval_steps_per_second": 4.289, "step": 3000 }, { "entropy": 0.1764632138889283, "epoch": 0.26676073105668113, "grad_norm": 5.15625, "learning_rate": 0.0001981873511960306, "loss": 0.22552785873413086, "mean_token_accuracy": 0.9531935697793961, "num_tokens": 3188120.0, "step": 3025 }, { "entropy": 0.14757733151782304, "epoch": 0.26896536519764547, "grad_norm": 7.96875, "learning_rate": 0.00019814196198649948, "loss": 0.22005025863647462, "mean_token_accuracy": 0.9574960014224052, "num_tokens": 3214224.0, "step": 3050 }, { "entropy": 0.13798468225868418, "epoch": 0.27116999933860975, "grad_norm": 5.5625, "learning_rate": 0.00019809601683080805, "loss": 0.18730186462402343, "mean_token_accuracy": 0.9637488493323326, "num_tokens": 3240698.0, "step": 3075 }, { "entropy": 0.1589315331657417, "epoch": 0.2733746334795741, "grad_norm": 6.9375, "learning_rate": 0.0001980495159892225, "loss": 0.19896188735961914, "mean_token_accuracy": 0.9575601083040237, "num_tokens": 3265159.0, "step": 3100 }, { "entropy": 0.17124603053322063, "epoch": 0.27557926762053836, "grad_norm": 3.84375, "learning_rate": 0.00019800245972515675, "loss": 0.22188325881958007, "mean_token_accuracy": 0.9544751027226448, "num_tokens": 3293130.0, "step": 3125 }, { "entropy": 0.17563861726317553, "epoch": 0.2777839017615027, "grad_norm": 11.3125, "learning_rate": 0.0001979548483051711, "loss": 0.20833553314208986, "mean_token_accuracy": 0.9533888497948646, "num_tokens": 3319208.0, "step": 3150 }, { "entropy": 0.17662670996272936, "epoch": 0.279988535902467, "grad_norm": 7.125, "learning_rate": 0.00019790668199897072, "loss": 0.2158352851867676, "mean_token_accuracy": 0.9493980967998504, "num_tokens": 3345268.0, "step": 3175 }, { "entropy": 0.20901564525905997, "epoch": 0.2821931700434313, "grad_norm": 6.3125, "learning_rate": 0.00019785796107940385, "loss": 0.2893720817565918, "mean_token_accuracy": 0.9452108177542686, "num_tokens": 3373762.0, "step": 3200 }, { "entropy": 0.15227315539959818, "epoch": 0.2843978041843956, "grad_norm": 2.84375, "learning_rate": 0.00019780868582246064, "loss": 0.1822994613647461, "mean_token_accuracy": 0.9600504431128501, "num_tokens": 3400602.0, "step": 3225 }, { "entropy": 0.15691042528487742, "epoch": 0.2866024383253599, "grad_norm": 7.375, "learning_rate": 0.0001977588565072713, "loss": 0.23442533493041992, "mean_token_accuracy": 0.9516582262516021, "num_tokens": 3427327.0, "step": 3250 }, { "entropy": 0.15344003664329647, "epoch": 0.2888070724663242, "grad_norm": 7.96875, "learning_rate": 0.0001977084734161047, "loss": 0.19131439208984374, "mean_token_accuracy": 0.9592954310774803, "num_tokens": 3453597.0, "step": 3275 }, { "entropy": 0.17065959631931038, "epoch": 0.29101170660728853, "grad_norm": 6.5, "learning_rate": 0.00019765753683436663, "loss": 0.23813190460205078, "mean_token_accuracy": 0.9509798160195351, "num_tokens": 3480758.0, "step": 3300 }, { "entropy": 0.16926921415608376, "epoch": 0.2932163407482528, "grad_norm": 6.21875, "learning_rate": 0.00019760604705059822, "loss": 0.2048003387451172, "mean_token_accuracy": 0.9565244841575623, "num_tokens": 3506743.0, "step": 3325 }, { "entropy": 0.1781733432924375, "epoch": 0.29542097488921715, "grad_norm": 3.140625, "learning_rate": 0.00019755400435647445, "loss": 0.24166810989379883, "mean_token_accuracy": 0.9541838404536247, "num_tokens": 3533219.0, "step": 3350 }, { "entropy": 0.15910907412180678, "epoch": 0.2976256090301814, "grad_norm": 4.34375, "learning_rate": 0.00019750140904680223, "loss": 0.2406574821472168, "mean_token_accuracy": 0.952444885969162, "num_tokens": 3559206.0, "step": 3375 }, { "entropy": 0.17944070099154488, "epoch": 0.29983024317114576, "grad_norm": 4.78125, "learning_rate": 0.00019744826141951903, "loss": 0.24596309661865234, "mean_token_accuracy": 0.9527160394191742, "num_tokens": 3585456.0, "step": 3400 }, { "entropy": 0.14929035058245063, "epoch": 0.30203487731211004, "grad_norm": 4.40625, "learning_rate": 0.00019739456177569092, "loss": 0.20019350051879883, "mean_token_accuracy": 0.9583090448379517, "num_tokens": 3611511.0, "step": 3425 }, { "entropy": 0.15263430486433208, "epoch": 0.3042395114530744, "grad_norm": 4.53125, "learning_rate": 0.000197340310419511, "loss": 0.19227680206298828, "mean_token_accuracy": 0.956867307126522, "num_tokens": 3637287.0, "step": 3450 }, { "entropy": 0.15233371320180594, "epoch": 0.30644414559403865, "grad_norm": 15.125, "learning_rate": 0.0001972855076582978, "loss": 0.1797281265258789, "mean_token_accuracy": 0.9600564774870872, "num_tokens": 3662617.0, "step": 3475 }, { "entropy": 0.1706169345683884, "epoch": 0.308648779735003, "grad_norm": 5.28125, "learning_rate": 0.0001972301538024932, "loss": 0.22658414840698243, "mean_token_accuracy": 0.9483333799242973, "num_tokens": 3689357.0, "step": 3500 }, { "entropy": 0.19369855729979463, "epoch": 0.31085341387596727, "grad_norm": 13.625, "learning_rate": 0.00019717424916566102, "loss": 0.2644779968261719, "mean_token_accuracy": 0.9458743578195572, "num_tokens": 3718450.0, "step": 3525 }, { "entropy": 0.17568644701968877, "epoch": 0.3130580480169316, "grad_norm": 7.28125, "learning_rate": 0.00019711779406448505, "loss": 0.22591154098510743, "mean_token_accuracy": 0.955237175822258, "num_tokens": 3745358.0, "step": 3550 }, { "entropy": 0.15584104033769108, "epoch": 0.3152626821578959, "grad_norm": 7.40625, "learning_rate": 0.00019706078881876724, "loss": 0.1931936836242676, "mean_token_accuracy": 0.9567597940564155, "num_tokens": 3770330.0, "step": 3575 }, { "entropy": 0.1786574741289951, "epoch": 0.3174673162988602, "grad_norm": 6.59375, "learning_rate": 0.00019700323375142608, "loss": 0.22003387451171874, "mean_token_accuracy": 0.9521878311038017, "num_tokens": 3796581.0, "step": 3600 }, { "entropy": 0.1740282563189976, "epoch": 0.3196719504398245, "grad_norm": 6.84375, "learning_rate": 0.00019694512918849453, "loss": 0.20711589813232423, "mean_token_accuracy": 0.9538468858599662, "num_tokens": 3822606.0, "step": 3625 }, { "entropy": 0.14037064747535624, "epoch": 0.3218765845807888, "grad_norm": 7.96875, "learning_rate": 0.00019688647545911832, "loss": 0.1778187370300293, "mean_token_accuracy": 0.9586274382472039, "num_tokens": 3848390.0, "step": 3650 }, { "entropy": 0.15400171629153192, "epoch": 0.3240812187217531, "grad_norm": 7.6875, "learning_rate": 0.00019682727289555417, "loss": 0.21966312408447267, "mean_token_accuracy": 0.9530961990356446, "num_tokens": 3873771.0, "step": 3675 }, { "entropy": 0.192363264701562, "epoch": 0.32628585286271744, "grad_norm": 5.09375, "learning_rate": 0.00019676752183316753, "loss": 0.2318318748474121, "mean_token_accuracy": 0.9510029405355453, "num_tokens": 3898700.0, "step": 3700 }, { "entropy": 0.15507790248957462, "epoch": 0.3284904870036817, "grad_norm": 6.25, "learning_rate": 0.00019670722261043119, "loss": 0.20907793045043946, "mean_token_accuracy": 0.9551364532113076, "num_tokens": 3925248.0, "step": 3725 }, { "entropy": 0.18347839491441845, "epoch": 0.33069512114464605, "grad_norm": 4.96875, "learning_rate": 0.0001966463755689229, "loss": 0.22271282196044923, "mean_token_accuracy": 0.9531350857019425, "num_tokens": 3952118.0, "step": 3750 }, { "entropy": 0.1369088706956245, "epoch": 0.33289975528561033, "grad_norm": 6.28125, "learning_rate": 0.00019658498105332392, "loss": 0.16287109375, "mean_token_accuracy": 0.9628218576312065, "num_tokens": 3978522.0, "step": 3775 }, { "entropy": 0.15609612919040955, "epoch": 0.33510438942657467, "grad_norm": 7.34375, "learning_rate": 0.0001965230394114165, "loss": 0.19069808959960938, "mean_token_accuracy": 0.9597328254580497, "num_tokens": 4003867.0, "step": 3800 }, { "entropy": 0.1823128617950715, "epoch": 0.33730902356753895, "grad_norm": 2.59375, "learning_rate": 0.0001964605509940824, "loss": 0.24169843673706054, "mean_token_accuracy": 0.9522805500030518, "num_tokens": 4031117.0, "step": 3825 }, { "entropy": 0.15791643084958196, "epoch": 0.3395136577085033, "grad_norm": 7.96875, "learning_rate": 0.00019639751615530059, "loss": 0.19818052291870117, "mean_token_accuracy": 0.9531289768218995, "num_tokens": 4056962.0, "step": 3850 }, { "entropy": 0.16439166482770814, "epoch": 0.34171829184946756, "grad_norm": 4.65625, "learning_rate": 0.00019633393525214548, "loss": 0.21556419372558594, "mean_token_accuracy": 0.955651236474514, "num_tokens": 4083717.0, "step": 3875 }, { "entropy": 0.18368883373914285, "epoch": 0.3439229259904319, "grad_norm": 5.5625, "learning_rate": 0.00019626980864478462, "loss": 0.2704266929626465, "mean_token_accuracy": 0.9481986343860627, "num_tokens": 4111693.0, "step": 3900 }, { "entropy": 0.18578361921128816, "epoch": 0.3461275601313962, "grad_norm": 6.40625, "learning_rate": 0.000196205136696477, "loss": 0.23863527297973633, "mean_token_accuracy": 0.951669595837593, "num_tokens": 4138165.0, "step": 3925 }, { "entropy": 0.1771517199045047, "epoch": 0.3483321942723605, "grad_norm": 4.78125, "learning_rate": 0.00019613991977357066, "loss": 0.23310329437255858, "mean_token_accuracy": 0.9477997007966041, "num_tokens": 4166504.0, "step": 3950 }, { "entropy": 0.18780332374793943, "epoch": 0.3505368284133248, "grad_norm": 5.78125, "learning_rate": 0.00019607415824550087, "loss": 0.23442667007446288, "mean_token_accuracy": 0.9532361942529678, "num_tokens": 4191893.0, "step": 3975 }, { "entropy": 0.18824178458191454, "epoch": 0.3527414625542891, "grad_norm": 6.125, "learning_rate": 0.0001960078524847879, "loss": 0.2508979606628418, "mean_token_accuracy": 0.9472871825098992, "num_tokens": 4218282.0, "step": 4000 }, { "epoch": 0.3527414625542891, "eval_entropy": 0.11648815209642886, "eval_loss": 0.10695337504148483, "eval_mean_token_accuracy": 0.9704467630580386, "eval_num_tokens": 4218282.0, "eval_runtime": 245.6179, "eval_samples_per_second": 15.996, "eval_steps_per_second": 4.002, "step": 4000 }, { "entropy": 0.1659863335173577, "epoch": 0.3549460966952534, "grad_norm": 5.375, "learning_rate": 0.00019594100286703486, "loss": 0.22078191757202148, "mean_token_accuracy": 0.9524767264723778, "num_tokens": 4243916.0, "step": 4025 }, { "entropy": 0.14453170219901948, "epoch": 0.35715073083621773, "grad_norm": 3.03125, "learning_rate": 0.00019587360977092573, "loss": 0.205191707611084, "mean_token_accuracy": 0.9619744899868965, "num_tokens": 4269856.0, "step": 4050 }, { "entropy": 0.1569202733691782, "epoch": 0.359355364977182, "grad_norm": 4.40625, "learning_rate": 0.00019580567357822321, "loss": 0.18136905670166015, "mean_token_accuracy": 0.9600639209151268, "num_tokens": 4295878.0, "step": 4075 }, { "entropy": 0.18661301417043433, "epoch": 0.36155999911814635, "grad_norm": 7.5, "learning_rate": 0.00019573719467376636, "loss": 0.27021623611450196, "mean_token_accuracy": 0.949309915304184, "num_tokens": 4323336.0, "step": 4100 }, { "entropy": 0.170799303437816, "epoch": 0.3637646332591106, "grad_norm": 2.8125, "learning_rate": 0.00019566817344546862, "loss": 0.2194487190246582, "mean_token_accuracy": 0.9553793686628341, "num_tokens": 4348815.0, "step": 4125 }, { "entropy": 0.16419745851657352, "epoch": 0.36596926740007496, "grad_norm": 3.296875, "learning_rate": 0.00019559861028431547, "loss": 0.2253945541381836, "mean_token_accuracy": 0.9522608941793442, "num_tokens": 4375715.0, "step": 4150 }, { "entropy": 0.15910948771517724, "epoch": 0.36817390154103924, "grad_norm": 6.40625, "learning_rate": 0.00019552850558436242, "loss": 0.2144721794128418, "mean_token_accuracy": 0.9533983466029167, "num_tokens": 4401090.0, "step": 4175 }, { "entropy": 0.16045264942571522, "epoch": 0.3703785356820036, "grad_norm": 5.0625, "learning_rate": 0.00019545785974273247, "loss": 0.1865951156616211, "mean_token_accuracy": 0.9595495608448982, "num_tokens": 4426453.0, "step": 4200 }, { "entropy": 0.16479153966531157, "epoch": 0.37258316982296785, "grad_norm": 5.09375, "learning_rate": 0.00019538667315961415, "loss": 0.1973789405822754, "mean_token_accuracy": 0.9551339733600617, "num_tokens": 4451172.0, "step": 4225 }, { "entropy": 0.19048072915524245, "epoch": 0.3747878039639322, "grad_norm": 8.0625, "learning_rate": 0.00019531494623825917, "loss": 0.2748769378662109, "mean_token_accuracy": 0.9468750369548797, "num_tokens": 4479694.0, "step": 4250 }, { "entropy": 0.1996890745626297, "epoch": 0.37699243810489647, "grad_norm": 12.0, "learning_rate": 0.0001952426793849799, "loss": 0.23543556213378905, "mean_token_accuracy": 0.9533787325024605, "num_tokens": 4506605.0, "step": 4275 }, { "entropy": 0.17569803014863283, "epoch": 0.3791970722458608, "grad_norm": 8.6875, "learning_rate": 0.00019516987300914753, "loss": 0.22635995864868164, "mean_token_accuracy": 0.9482355606555939, "num_tokens": 4533336.0, "step": 4300 }, { "entropy": 0.18504171287175267, "epoch": 0.3814017063868251, "grad_norm": 5.09375, "learning_rate": 0.0001950965275231893, "loss": 0.23127546310424804, "mean_token_accuracy": 0.9499982151389122, "num_tokens": 4560672.0, "step": 4325 }, { "entropy": 0.1998164102109149, "epoch": 0.3836063405277894, "grad_norm": 6.71875, "learning_rate": 0.00019502264334258644, "loss": 0.2719668769836426, "mean_token_accuracy": 0.9475091502070427, "num_tokens": 4587955.0, "step": 4350 }, { "entropy": 0.16743096579564734, "epoch": 0.38581097466875375, "grad_norm": 7.6875, "learning_rate": 0.00019494822088587168, "loss": 0.22320510864257812, "mean_token_accuracy": 0.9563138785958291, "num_tokens": 4614744.0, "step": 4375 }, { "entropy": 0.15541932496009395, "epoch": 0.388015608809718, "grad_norm": 5.34375, "learning_rate": 0.00019487326057462704, "loss": 0.18758811950683593, "mean_token_accuracy": 0.9599009090662003, "num_tokens": 4640942.0, "step": 4400 }, { "entropy": 0.153234211106319, "epoch": 0.39022024295068236, "grad_norm": 6.8125, "learning_rate": 0.0001947977628334812, "loss": 0.1849317741394043, "mean_token_accuracy": 0.9574062630534173, "num_tokens": 4666916.0, "step": 4425 }, { "entropy": 0.19829942288808525, "epoch": 0.39242487709164664, "grad_norm": 6.03125, "learning_rate": 0.0001947217280901073, "loss": 0.2654203224182129, "mean_token_accuracy": 0.9488379114866257, "num_tokens": 4693082.0, "step": 4450 }, { "entropy": 0.16388011447619646, "epoch": 0.394629511232611, "grad_norm": 6.65625, "learning_rate": 0.00019464515677522037, "loss": 0.20659452438354492, "mean_token_accuracy": 0.9587969416379929, "num_tokens": 4718146.0, "step": 4475 }, { "entropy": 0.15318717606249266, "epoch": 0.39683414537357525, "grad_norm": 1.7265625, "learning_rate": 0.00019456804932257513, "loss": 0.21035289764404297, "mean_token_accuracy": 0.9577053633332252, "num_tokens": 4745031.0, "step": 4500 }, { "entropy": 0.16182554476428776, "epoch": 0.3990387795145396, "grad_norm": 7.53125, "learning_rate": 0.00019449040616896314, "loss": 0.22390464782714845, "mean_token_accuracy": 0.9582294577360153, "num_tokens": 4770398.0, "step": 4525 }, { "entropy": 0.16457911616773344, "epoch": 0.40124341365550387, "grad_norm": 5.71875, "learning_rate": 0.00019441222775421076, "loss": 0.2118742561340332, "mean_token_accuracy": 0.9571871975064278, "num_tokens": 4795649.0, "step": 4550 }, { "entropy": 0.17897488735150546, "epoch": 0.4034480477964682, "grad_norm": 6.5625, "learning_rate": 0.00019433351452117635, "loss": 0.22478347778320312, "mean_token_accuracy": 0.9507955679297447, "num_tokens": 4821974.0, "step": 4575 }, { "entropy": 0.20635093068238347, "epoch": 0.4056526819374325, "grad_norm": 5.375, "learning_rate": 0.00019425426691574785, "loss": 0.30243276596069335, "mean_token_accuracy": 0.940371046513319, "num_tokens": 4850609.0, "step": 4600 }, { "entropy": 0.16062138011795468, "epoch": 0.4078573160783968, "grad_norm": 6.1875, "learning_rate": 0.00019417448538684026, "loss": 0.20213171005249023, "mean_token_accuracy": 0.9580634304881096, "num_tokens": 4876172.0, "step": 4625 }, { "entropy": 0.15726084834081122, "epoch": 0.4100619502193611, "grad_norm": 4.46875, "learning_rate": 0.00019409417038639322, "loss": 0.2142583656311035, "mean_token_accuracy": 0.9570022109150886, "num_tokens": 4902385.0, "step": 4650 }, { "entropy": 0.17165626873378642, "epoch": 0.4122665843603254, "grad_norm": 7.0, "learning_rate": 0.00019401332236936817, "loss": 0.23059816360473634, "mean_token_accuracy": 0.9527977633476258, "num_tokens": 4929222.0, "step": 4675 }, { "entropy": 0.1640852385875769, "epoch": 0.4144712185012897, "grad_norm": 5.46875, "learning_rate": 0.00019393194179374604, "loss": 0.2273613166809082, "mean_token_accuracy": 0.9548634958267211, "num_tokens": 4955671.0, "step": 4700 }, { "entropy": 0.1570255648170132, "epoch": 0.41667585264225404, "grad_norm": 2.734375, "learning_rate": 0.00019385002912052454, "loss": 0.2143065071105957, "mean_token_accuracy": 0.9560745039582252, "num_tokens": 4982931.0, "step": 4725 }, { "entropy": 0.17041039526229723, "epoch": 0.4188804867832183, "grad_norm": 4.0, "learning_rate": 0.00019376758481371556, "loss": 0.22071348190307616, "mean_token_accuracy": 0.9523491749167442, "num_tokens": 5010878.0, "step": 4750 }, { "entropy": 0.16272783821914344, "epoch": 0.42108512092418265, "grad_norm": 5.84375, "learning_rate": 0.0001936846093403425, "loss": 0.2147397804260254, "mean_token_accuracy": 0.9509309217333793, "num_tokens": 5036498.0, "step": 4775 }, { "entropy": 0.19207242332515306, "epoch": 0.42328975506514693, "grad_norm": 4.125, "learning_rate": 0.00019360110317043772, "loss": 0.2490982246398926, "mean_token_accuracy": 0.9515020102262497, "num_tokens": 5063787.0, "step": 4800 }, { "entropy": 0.17644908792804925, "epoch": 0.42549438920611127, "grad_norm": 9.875, "learning_rate": 0.00019351706677703975, "loss": 0.23829484939575196, "mean_token_accuracy": 0.9441190361976624, "num_tokens": 5091038.0, "step": 4825 }, { "entropy": 0.14317711226758548, "epoch": 0.42769902334707555, "grad_norm": 6.875, "learning_rate": 0.00019343250063619082, "loss": 0.18992048263549804, "mean_token_accuracy": 0.9606717613339424, "num_tokens": 5118165.0, "step": 4850 }, { "entropy": 0.18368843147065492, "epoch": 0.4299036574880399, "grad_norm": 19.0, "learning_rate": 0.00019334740522693392, "loss": 0.2248593521118164, "mean_token_accuracy": 0.9531078413128853, "num_tokens": 5144207.0, "step": 4875 }, { "entropy": 0.1561399988271296, "epoch": 0.43210829162900416, "grad_norm": 5.78125, "learning_rate": 0.00019326178103131017, "loss": 0.2132600212097168, "mean_token_accuracy": 0.9495953798294068, "num_tokens": 5169691.0, "step": 4900 }, { "entropy": 0.14101963342865928, "epoch": 0.4343129257699685, "grad_norm": 2.734375, "learning_rate": 0.0001931756285343562, "loss": 0.18087581634521485, "mean_token_accuracy": 0.9611552309989929, "num_tokens": 5195767.0, "step": 4925 }, { "entropy": 0.16966247914591803, "epoch": 0.4365175599109328, "grad_norm": 3.4375, "learning_rate": 0.0001930889482241013, "loss": 0.2111412239074707, "mean_token_accuracy": 0.9541240110993385, "num_tokens": 5222731.0, "step": 4950 }, { "entropy": 0.1666613586130552, "epoch": 0.4387221940518971, "grad_norm": 7.125, "learning_rate": 0.0001930017405915646, "loss": 0.2021429443359375, "mean_token_accuracy": 0.9588029065728187, "num_tokens": 5249113.0, "step": 4975 }, { "entropy": 0.14360476849251427, "epoch": 0.4409268281928614, "grad_norm": 6.71875, "learning_rate": 0.00019291400613075243, "loss": 0.22467676162719727, "mean_token_accuracy": 0.955079542696476, "num_tokens": 5276016.0, "step": 5000 }, { "epoch": 0.4409268281928614, "eval_entropy": 0.10869147787145353, "eval_loss": 0.10057255625724792, "eval_mean_token_accuracy": 0.9738030348158805, "eval_num_tokens": 5276016.0, "eval_runtime": 227.4944, "eval_samples_per_second": 17.271, "eval_steps_per_second": 4.321, "step": 5000 }, { "entropy": 0.13281712058582343, "epoch": 0.4431314623338257, "grad_norm": 16.75, "learning_rate": 0.00019282574533865542, "loss": 0.1655169105529785, "mean_token_accuracy": 0.9662536835670471, "num_tokens": 5299609.0, "step": 5025 }, { "entropy": 0.15537576926173643, "epoch": 0.44533609647479, "grad_norm": 3.53125, "learning_rate": 0.00019273695871524575, "loss": 0.18183206558227538, "mean_token_accuracy": 0.961754854619503, "num_tokens": 5324890.0, "step": 5050 }, { "entropy": 0.17722081926651298, "epoch": 0.44754073061575433, "grad_norm": 5.53125, "learning_rate": 0.00019264764676347427, "loss": 0.2252979850769043, "mean_token_accuracy": 0.9469957205653191, "num_tokens": 5352166.0, "step": 5075 }, { "entropy": 0.16153283580671995, "epoch": 0.4497453647567186, "grad_norm": 5.96875, "learning_rate": 0.00019255780998926763, "loss": 0.20538576126098632, "mean_token_accuracy": 0.9543823391199112, "num_tokens": 5379076.0, "step": 5100 }, { "entropy": 0.17977383355842902, "epoch": 0.45194999889768295, "grad_norm": 5.53125, "learning_rate": 0.00019246744890152545, "loss": 0.2403662109375, "mean_token_accuracy": 0.9503585311770439, "num_tokens": 5404897.0, "step": 5125 }, { "entropy": 0.14496052238857374, "epoch": 0.4541546330386472, "grad_norm": 5.375, "learning_rate": 0.00019237656401211757, "loss": 0.19734691619873046, "mean_token_accuracy": 0.9596367552876472, "num_tokens": 5431544.0, "step": 5150 }, { "entropy": 0.16713952826918102, "epoch": 0.45635926717961156, "grad_norm": 6.75, "learning_rate": 0.00019228515583588079, "loss": 0.22258760452270507, "mean_token_accuracy": 0.9565337428450584, "num_tokens": 5457123.0, "step": 5175 }, { "entropy": 0.16149105805088765, "epoch": 0.45856390132057584, "grad_norm": 8.5625, "learning_rate": 0.00019219322489061634, "loss": 0.22547231674194335, "mean_token_accuracy": 0.9535848796367645, "num_tokens": 5484041.0, "step": 5200 }, { "entropy": 0.148517404072918, "epoch": 0.4607685354615402, "grad_norm": 3.0625, "learning_rate": 0.00019210077169708675, "loss": 0.2167955207824707, "mean_token_accuracy": 0.9557743620872498, "num_tokens": 5511905.0, "step": 5225 }, { "entropy": 0.1500109832943417, "epoch": 0.46297316960250445, "grad_norm": 4.0625, "learning_rate": 0.00019200779677901295, "loss": 0.21606193542480467, "mean_token_accuracy": 0.9585763025283813, "num_tokens": 5537774.0, "step": 5250 }, { "entropy": 0.1966726200678386, "epoch": 0.4651778037434688, "grad_norm": 5.8125, "learning_rate": 0.00019191430066307124, "loss": 0.23859842300415038, "mean_token_accuracy": 0.9467063054442406, "num_tokens": 5565426.0, "step": 5275 }, { "entropy": 0.17433940736344083, "epoch": 0.46738243788443307, "grad_norm": 4.46875, "learning_rate": 0.0001918202838788904, "loss": 0.224346923828125, "mean_token_accuracy": 0.9562974636256695, "num_tokens": 5591888.0, "step": 5300 }, { "entropy": 0.13106359140831045, "epoch": 0.4695870720253974, "grad_norm": 5.5625, "learning_rate": 0.0001917257469590487, "loss": 0.1792782211303711, "mean_token_accuracy": 0.9603891870379448, "num_tokens": 5617317.0, "step": 5325 }, { "entropy": 0.15177158450707792, "epoch": 0.4717917061663617, "grad_norm": 3.921875, "learning_rate": 0.00019163069043907064, "loss": 0.21172012329101564, "mean_token_accuracy": 0.9570096290111542, "num_tokens": 5643652.0, "step": 5350 }, { "entropy": 0.1602694686234463, "epoch": 0.473996340307326, "grad_norm": 7.65625, "learning_rate": 0.00019153511485742435, "loss": 0.19996971130371094, "mean_token_accuracy": 0.9593239459395408, "num_tokens": 5669256.0, "step": 5375 }, { "entropy": 0.15173581497278066, "epoch": 0.4762009744482903, "grad_norm": 3.71875, "learning_rate": 0.0001914390207555181, "loss": 0.19701797485351563, "mean_token_accuracy": 0.9582894539833069, "num_tokens": 5694392.0, "step": 5400 }, { "entropy": 0.1408918967458885, "epoch": 0.4784056085892546, "grad_norm": 4.28125, "learning_rate": 0.00019134240867769756, "loss": 0.17704242706298828, "mean_token_accuracy": 0.9625860941410065, "num_tokens": 5720793.0, "step": 5425 }, { "entropy": 0.15985940964194015, "epoch": 0.4806102427302189, "grad_norm": 5.59375, "learning_rate": 0.0001912452791712425, "loss": 0.2061847686767578, "mean_token_accuracy": 0.9506743770837783, "num_tokens": 5748648.0, "step": 5450 }, { "entropy": 0.1572291606734507, "epoch": 0.48281487687118324, "grad_norm": 12.5, "learning_rate": 0.00019114763278636385, "loss": 0.1869456672668457, "mean_token_accuracy": 0.9610082852840424, "num_tokens": 5774702.0, "step": 5475 }, { "entropy": 0.15579537914483807, "epoch": 0.4850195110121475, "grad_norm": 3.328125, "learning_rate": 0.00019104947007620045, "loss": 0.22569913864135743, "mean_token_accuracy": 0.9553910180926323, "num_tokens": 5800390.0, "step": 5500 }, { "entropy": 0.1744847889256198, "epoch": 0.48722414515311185, "grad_norm": 5.40625, "learning_rate": 0.00019095079159681596, "loss": 0.2129666328430176, "mean_token_accuracy": 0.9534970700740815, "num_tokens": 5828020.0, "step": 5525 }, { "entropy": 0.12404027001583018, "epoch": 0.48942877929407613, "grad_norm": 3.71875, "learning_rate": 0.0001908515979071958, "loss": 0.13661216735839843, "mean_token_accuracy": 0.9668267214298248, "num_tokens": 5851718.0, "step": 5550 }, { "entropy": 0.15956679730443285, "epoch": 0.49163341343504047, "grad_norm": 8.75, "learning_rate": 0.00019075188956924386, "loss": 0.21861886978149414, "mean_token_accuracy": 0.9555191951990127, "num_tokens": 5879049.0, "step": 5575 }, { "entropy": 0.15942344037815928, "epoch": 0.49383804757600475, "grad_norm": 5.0625, "learning_rate": 0.00019065166714777934, "loss": 0.20734643936157227, "mean_token_accuracy": 0.9620197328925133, "num_tokens": 5904931.0, "step": 5600 }, { "entropy": 0.19217802144237794, "epoch": 0.4960426817169691, "grad_norm": 4.53125, "learning_rate": 0.00019055093121053365, "loss": 0.26606002807617185, "mean_token_accuracy": 0.9508296462893486, "num_tokens": 5931600.0, "step": 5625 }, { "entropy": 0.15677706636604852, "epoch": 0.49824731585793336, "grad_norm": 4.28125, "learning_rate": 0.00019044968232814703, "loss": 0.2043045425415039, "mean_token_accuracy": 0.9544167664647102, "num_tokens": 5957753.0, "step": 5650 }, { "entropy": 0.18609977641841396, "epoch": 0.5004519499988976, "grad_norm": 3.578125, "learning_rate": 0.00019034792107416553, "loss": 0.21894699096679687, "mean_token_accuracy": 0.9534892725944519, "num_tokens": 5985243.0, "step": 5675 }, { "entropy": 0.14997990630334243, "epoch": 0.502656584139862, "grad_norm": 7.3125, "learning_rate": 0.0001902456480250375, "loss": 0.19551092147827148, "mean_token_accuracy": 0.9585751879215241, "num_tokens": 6010402.0, "step": 5700 }, { "entropy": 0.1572682385914959, "epoch": 0.5048612182808263, "grad_norm": 4.34375, "learning_rate": 0.00019014286376011055, "loss": 0.19206081390380858, "mean_token_accuracy": 0.9578534030914306, "num_tokens": 6037075.0, "step": 5725 }, { "entropy": 0.15716539665358142, "epoch": 0.5070658524217906, "grad_norm": 5.03125, "learning_rate": 0.00019003956886162816, "loss": 0.2075516700744629, "mean_token_accuracy": 0.9590855090320111, "num_tokens": 6063439.0, "step": 5750 }, { "entropy": 0.14328026061179117, "epoch": 0.5092704865627549, "grad_norm": 9.3125, "learning_rate": 0.0001899357639147264, "loss": 0.1804123878479004, "mean_token_accuracy": 0.9630845382809639, "num_tokens": 6089276.0, "step": 5775 }, { "entropy": 0.1787011620606063, "epoch": 0.5114751207037193, "grad_norm": 3.296875, "learning_rate": 0.0001898314495074306, "loss": 0.21809492111206055, "mean_token_accuracy": 0.955017312169075, "num_tokens": 6116074.0, "step": 5800 }, { "entropy": 0.14735706645878963, "epoch": 0.5136797548446835, "grad_norm": 11.625, "learning_rate": 0.0001897266262306521, "loss": 0.18583986282348633, "mean_token_accuracy": 0.9612311086058617, "num_tokens": 6142300.0, "step": 5825 }, { "entropy": 0.13354225278948434, "epoch": 0.5158843889856478, "grad_norm": 4.71875, "learning_rate": 0.0001896212946781848, "loss": 0.16306705474853517, "mean_token_accuracy": 0.9689052039384842, "num_tokens": 6167238.0, "step": 5850 }, { "entropy": 0.1298945102340076, "epoch": 0.5180890231266121, "grad_norm": 3.296875, "learning_rate": 0.0001895154554467018, "loss": 0.15991472244262694, "mean_token_accuracy": 0.9638067600131035, "num_tokens": 6193840.0, "step": 5875 }, { "entropy": 0.14238889124128037, "epoch": 0.5202936572675765, "grad_norm": 4.375, "learning_rate": 0.00018940910913575206, "loss": 0.19000024795532228, "mean_token_accuracy": 0.9612208670377731, "num_tokens": 6219503.0, "step": 5900 }, { "entropy": 0.20510360905434935, "epoch": 0.5224982914085408, "grad_norm": 3.625, "learning_rate": 0.00018930225634775715, "loss": 0.26463899612426756, "mean_token_accuracy": 0.9487945803999901, "num_tokens": 6247058.0, "step": 5925 }, { "entropy": 0.1622152561810799, "epoch": 0.524702925549505, "grad_norm": 4.5625, "learning_rate": 0.00018919489768800746, "loss": 0.25773733139038085, "mean_token_accuracy": 0.9568088221549987, "num_tokens": 6273064.0, "step": 5950 }, { "entropy": 0.187178902864689, "epoch": 0.5269075596904693, "grad_norm": 4.15625, "learning_rate": 0.00018908703376465917, "loss": 0.24045007705688476, "mean_token_accuracy": 0.9494410088658333, "num_tokens": 6299137.0, "step": 5975 }, { "entropy": 0.1452439275966026, "epoch": 0.5291121938314337, "grad_norm": 9.625, "learning_rate": 0.00018897866518873053, "loss": 0.21541589736938477, "mean_token_accuracy": 0.9575134646892548, "num_tokens": 6324785.0, "step": 6000 }, { "epoch": 0.5291121938314337, "eval_entropy": 0.10676932354227452, "eval_loss": 0.10486993938684464, "eval_mean_token_accuracy": 0.971874090444035, "eval_num_tokens": 6324785.0, "eval_runtime": 226.9895, "eval_samples_per_second": 17.309, "eval_steps_per_second": 4.331, "step": 6000 }, { "entropy": 0.17648567588767036, "epoch": 0.531316827972398, "grad_norm": 3.734375, "learning_rate": 0.0001888697925740986, "loss": 0.20909486770629881, "mean_token_accuracy": 0.954936962723732, "num_tokens": 6351238.0, "step": 6025 }, { "entropy": 0.16155564374057577, "epoch": 0.5335214621133623, "grad_norm": 1.28125, "learning_rate": 0.00018876041653749552, "loss": 0.1900315284729004, "mean_token_accuracy": 0.9586120668053627, "num_tokens": 6378264.0, "step": 6050 }, { "entropy": 0.15692011128994637, "epoch": 0.5357260962543265, "grad_norm": 6.65625, "learning_rate": 0.00018865053769850538, "loss": 0.2241463279724121, "mean_token_accuracy": 0.9549520462751389, "num_tokens": 6405407.0, "step": 6075 }, { "entropy": 0.16224516270449385, "epoch": 0.5379307303952909, "grad_norm": 5.03125, "learning_rate": 0.00018854015667956034, "loss": 0.20405168533325196, "mean_token_accuracy": 0.9515541243553162, "num_tokens": 6432386.0, "step": 6100 }, { "entropy": 0.1313534512137994, "epoch": 0.5401353645362552, "grad_norm": 3.390625, "learning_rate": 0.00018842927410593732, "loss": 0.16415348052978515, "mean_token_accuracy": 0.9615388405323029, "num_tokens": 6458905.0, "step": 6125 }, { "entropy": 0.1441493243572768, "epoch": 0.5423399986772195, "grad_norm": 4.75, "learning_rate": 0.00018831789060575442, "loss": 0.20023174285888673, "mean_token_accuracy": 0.9581418094038964, "num_tokens": 6485447.0, "step": 6150 }, { "entropy": 0.16584216450923123, "epoch": 0.5445446328181838, "grad_norm": 9.0, "learning_rate": 0.0001882060068099673, "loss": 0.22106365203857423, "mean_token_accuracy": 0.9604924789071083, "num_tokens": 6511693.0, "step": 6175 }, { "entropy": 0.1466969000035897, "epoch": 0.5467492669591482, "grad_norm": 3.546875, "learning_rate": 0.00018809362335236575, "loss": 0.1853495407104492, "mean_token_accuracy": 0.9564072874188423, "num_tokens": 6538084.0, "step": 6200 }, { "entropy": 0.14249630046426318, "epoch": 0.5489539011001124, "grad_norm": 2.890625, "learning_rate": 0.00018798074086956988, "loss": 0.16907304763793946, "mean_token_accuracy": 0.9658751469850541, "num_tokens": 6563262.0, "step": 6225 }, { "entropy": 0.13506768403225577, "epoch": 0.5511585352410767, "grad_norm": 5.15625, "learning_rate": 0.00018786736000102664, "loss": 0.1854391860961914, "mean_token_accuracy": 0.9624115920066834, "num_tokens": 6590039.0, "step": 6250 }, { "entropy": 0.15223577088676393, "epoch": 0.553363169382041, "grad_norm": 7.3125, "learning_rate": 0.00018775348138900632, "loss": 0.20777523040771484, "mean_token_accuracy": 0.9607916563749314, "num_tokens": 6615751.0, "step": 6275 }, { "entropy": 0.1341767003881978, "epoch": 0.5555678035230054, "grad_norm": 6.03125, "learning_rate": 0.00018763910567859868, "loss": 0.18818994522094726, "mean_token_accuracy": 0.9605262127518653, "num_tokens": 6641586.0, "step": 6300 }, { "entropy": 0.1822560296789743, "epoch": 0.5577724376639697, "grad_norm": 4.53125, "learning_rate": 0.00018752423351770943, "loss": 0.21607881546020508, "mean_token_accuracy": 0.9530105289816856, "num_tokens": 6668241.0, "step": 6325 }, { "entropy": 0.15559989049565048, "epoch": 0.559977071804934, "grad_norm": 1.0078125, "learning_rate": 0.00018740886555705647, "loss": 0.18804313659667968, "mean_token_accuracy": 0.9619828999042511, "num_tokens": 6694831.0, "step": 6350 }, { "entropy": 0.15529708388843574, "epoch": 0.5621817059458982, "grad_norm": 5.46875, "learning_rate": 0.00018729300245016642, "loss": 0.21643871307373047, "mean_token_accuracy": 0.9549458172917366, "num_tokens": 6721275.0, "step": 6375 }, { "entropy": 0.14793858348275535, "epoch": 0.5643863400868626, "grad_norm": 2.890625, "learning_rate": 0.00018717664485337057, "loss": 0.19648042678833008, "mean_token_accuracy": 0.9602294343709946, "num_tokens": 6747046.0, "step": 6400 }, { "entropy": 0.17088732279487887, "epoch": 0.5665909742278269, "grad_norm": 4.75, "learning_rate": 0.00018705979342580146, "loss": 0.2485208511352539, "mean_token_accuracy": 0.9500703465938568, "num_tokens": 6774554.0, "step": 6425 }, { "entropy": 0.15431675165193157, "epoch": 0.5687956083687912, "grad_norm": 2.859375, "learning_rate": 0.00018694244882938907, "loss": 0.1909835433959961, "mean_token_accuracy": 0.9602775621414185, "num_tokens": 6800505.0, "step": 6450 }, { "entropy": 0.15024025222170168, "epoch": 0.5710002425097555, "grad_norm": 4.375, "learning_rate": 0.00018682461172885698, "loss": 0.20096403121948242, "mean_token_accuracy": 0.9598821967840194, "num_tokens": 6827587.0, "step": 6475 }, { "entropy": 0.15101411423878744, "epoch": 0.5732048766507198, "grad_norm": 4.75, "learning_rate": 0.00018670628279171862, "loss": 0.2041637420654297, "mean_token_accuracy": 0.9568271943926812, "num_tokens": 6854524.0, "step": 6500 }, { "entropy": 0.17572721259552054, "epoch": 0.5754095107916841, "grad_norm": 6.8125, "learning_rate": 0.0001865874626882737, "loss": 0.2292483901977539, "mean_token_accuracy": 0.956166204214096, "num_tokens": 6880994.0, "step": 6525 }, { "entropy": 0.11857264762860723, "epoch": 0.5776141449326484, "grad_norm": 3.046875, "learning_rate": 0.00018646815209160406, "loss": 0.1415870189666748, "mean_token_accuracy": 0.9724202772974968, "num_tokens": 6906538.0, "step": 6550 }, { "entropy": 0.18178646504296922, "epoch": 0.5798187790736128, "grad_norm": 2.828125, "learning_rate": 0.00018634835167757015, "loss": 0.25645376205444337, "mean_token_accuracy": 0.9543059349060059, "num_tokens": 6934582.0, "step": 6575 }, { "entropy": 0.15401741547277198, "epoch": 0.5820234132145771, "grad_norm": 9.25, "learning_rate": 0.00018622806212480707, "loss": 0.21368270874023437, "mean_token_accuracy": 0.9566551733016968, "num_tokens": 6960422.0, "step": 6600 }, { "entropy": 0.14491610620287246, "epoch": 0.5842280473555413, "grad_norm": 5.71875, "learning_rate": 0.0001861072841147207, "loss": 0.18427518844604493, "mean_token_accuracy": 0.9608854773640633, "num_tokens": 6987570.0, "step": 6625 }, { "entropy": 0.16291028478881345, "epoch": 0.5864326814965056, "grad_norm": 4.09375, "learning_rate": 0.00018598601833148405, "loss": 0.20536699295043945, "mean_token_accuracy": 0.9561599251627922, "num_tokens": 7015159.0, "step": 6650 }, { "entropy": 0.15031578235328197, "epoch": 0.58863731563747, "grad_norm": 5.96875, "learning_rate": 0.00018586426546203302, "loss": 0.2286543083190918, "mean_token_accuracy": 0.9568568438291549, "num_tokens": 7041683.0, "step": 6675 }, { "entropy": 0.15518903702031822, "epoch": 0.5908419497784343, "grad_norm": 7.90625, "learning_rate": 0.00018574202619606287, "loss": 0.20627035140991212, "mean_token_accuracy": 0.9582181671261787, "num_tokens": 7067898.0, "step": 6700 }, { "entropy": 0.1478130117879482, "epoch": 0.5930465839193986, "grad_norm": 8.1875, "learning_rate": 0.0001856193012260241, "loss": 0.17444656372070313, "mean_token_accuracy": 0.9628511995077134, "num_tokens": 7093130.0, "step": 6725 }, { "entropy": 0.16152513926150278, "epoch": 0.5952512180603629, "grad_norm": 5.0, "learning_rate": 0.00018549609124711853, "loss": 0.2013174057006836, "mean_token_accuracy": 0.9573907378315926, "num_tokens": 7119773.0, "step": 6750 }, { "entropy": 0.16015219626016916, "epoch": 0.5974558522013272, "grad_norm": 4.375, "learning_rate": 0.0001853723969572955, "loss": 0.20601200103759765, "mean_token_accuracy": 0.9583083838224411, "num_tokens": 7147090.0, "step": 6775 }, { "entropy": 0.12353788227774203, "epoch": 0.5996604863422915, "grad_norm": 3.734375, "learning_rate": 0.00018524821905724782, "loss": 0.1696053123474121, "mean_token_accuracy": 0.9629499089717865, "num_tokens": 7173628.0, "step": 6800 }, { "entropy": 0.13745511685323436, "epoch": 0.6018651204832558, "grad_norm": 7.625, "learning_rate": 0.0001851235582504078, "loss": 0.19153583526611329, "mean_token_accuracy": 0.962202197611332, "num_tokens": 7199491.0, "step": 6825 }, { "entropy": 0.15157184965210035, "epoch": 0.6040697546242201, "grad_norm": 4.125, "learning_rate": 0.00018499841524294324, "loss": 0.21617828369140624, "mean_token_accuracy": 0.956987452507019, "num_tokens": 7225688.0, "step": 6850 }, { "entropy": 0.16270094153587705, "epoch": 0.6062743887651845, "grad_norm": 5.0, "learning_rate": 0.00018487279074375353, "loss": 0.19658201217651367, "mean_token_accuracy": 0.9590710332989693, "num_tokens": 7251979.0, "step": 6875 }, { "entropy": 0.13592043185140937, "epoch": 0.6084790229061487, "grad_norm": 4.1875, "learning_rate": 0.00018474668546446555, "loss": 0.17008283615112305, "mean_token_accuracy": 0.9622329398989677, "num_tokens": 7278416.0, "step": 6900 }, { "entropy": 0.15448786061489955, "epoch": 0.610683657047113, "grad_norm": 5.59375, "learning_rate": 0.0001846201001194296, "loss": 0.2108542251586914, "mean_token_accuracy": 0.9560682138800621, "num_tokens": 7305245.0, "step": 6925 }, { "entropy": 0.17204750362201593, "epoch": 0.6128882911880773, "grad_norm": 3.1875, "learning_rate": 0.0001844930354257156, "loss": 0.22721254348754882, "mean_token_accuracy": 0.9545628592371941, "num_tokens": 7331880.0, "step": 6950 }, { "entropy": 0.17656717666075564, "epoch": 0.6150929253290417, "grad_norm": 4.4375, "learning_rate": 0.00018436549210310862, "loss": 0.23982276916503906, "mean_token_accuracy": 0.9470600582659244, "num_tokens": 7358764.0, "step": 6975 }, { "entropy": 0.14622056474676356, "epoch": 0.617297559470006, "grad_norm": 2.3125, "learning_rate": 0.00018423747087410513, "loss": 0.17314342498779298, "mean_token_accuracy": 0.9619392481446266, "num_tokens": 7385487.0, "step": 7000 }, { "epoch": 0.617297559470006, "eval_entropy": 0.07945120481571462, "eval_loss": 0.08194578438997269, "eval_mean_token_accuracy": 0.9766436420525079, "eval_num_tokens": 7385487.0, "eval_runtime": 240.9795, "eval_samples_per_second": 16.304, "eval_steps_per_second": 4.079, "step": 7000 }, { "entropy": 0.13026506319874898, "epoch": 0.6195021936109703, "grad_norm": 7.40625, "learning_rate": 0.0001841089724639088, "loss": 0.1890636444091797, "mean_token_accuracy": 0.9626975417137146, "num_tokens": 7410738.0, "step": 7025 }, { "entropy": 0.1509742072003428, "epoch": 0.6217068277519345, "grad_norm": 5.09375, "learning_rate": 0.00018397999760042644, "loss": 0.21692123413085937, "mean_token_accuracy": 0.9581372204422951, "num_tokens": 7437696.0, "step": 7050 }, { "entropy": 0.1558595033886377, "epoch": 0.6239114618928989, "grad_norm": 5.15625, "learning_rate": 0.00018385054701426372, "loss": 0.22689201354980468, "mean_token_accuracy": 0.9588596966862678, "num_tokens": 7463958.0, "step": 7075 }, { "entropy": 0.13060404141549953, "epoch": 0.6261160960338632, "grad_norm": 4.875, "learning_rate": 0.00018372062143872127, "loss": 0.1784954261779785, "mean_token_accuracy": 0.9633405435085297, "num_tokens": 7489503.0, "step": 7100 }, { "entropy": 0.14149722107453272, "epoch": 0.6283207301748275, "grad_norm": 7.34375, "learning_rate": 0.00018359022160979027, "loss": 0.1862514877319336, "mean_token_accuracy": 0.9588928538560867, "num_tokens": 7515587.0, "step": 7125 }, { "entropy": 0.14298222974175587, "epoch": 0.6305253643157918, "grad_norm": 4.21875, "learning_rate": 0.0001834593482661485, "loss": 0.1809256362915039, "mean_token_accuracy": 0.9628977358341217, "num_tokens": 7542878.0, "step": 7150 }, { "entropy": 0.13202147453674115, "epoch": 0.6327299984567561, "grad_norm": 4.71875, "learning_rate": 0.0001833280021491561, "loss": 0.18633668899536132, "mean_token_accuracy": 0.9624699577689171, "num_tokens": 7569706.0, "step": 7175 }, { "entropy": 0.15507492707343773, "epoch": 0.6349346325977204, "grad_norm": 6.625, "learning_rate": 0.00018319618400285115, "loss": 0.20065113067626952, "mean_token_accuracy": 0.9619992870092392, "num_tokens": 7596763.0, "step": 7200 }, { "entropy": 0.15959744892315939, "epoch": 0.6371392667386847, "grad_norm": 8.6875, "learning_rate": 0.0001830638945739459, "loss": 0.20813972473144532, "mean_token_accuracy": 0.956459388434887, "num_tokens": 7623283.0, "step": 7225 }, { "entropy": 0.15210882770828904, "epoch": 0.639343900879649, "grad_norm": 5.1875, "learning_rate": 0.00018293113461182204, "loss": 0.20834075927734375, "mean_token_accuracy": 0.9586859959363937, "num_tokens": 7649230.0, "step": 7250 }, { "entropy": 0.14857870964333414, "epoch": 0.6415485350206134, "grad_norm": 4.0, "learning_rate": 0.00018279790486852693, "loss": 0.19865785598754881, "mean_token_accuracy": 0.9572332391142845, "num_tokens": 7676765.0, "step": 7275 }, { "entropy": 0.1303456796729006, "epoch": 0.6437531691615777, "grad_norm": 2.4375, "learning_rate": 0.00018266420609876885, "loss": 0.16255685806274414, "mean_token_accuracy": 0.963574868440628, "num_tokens": 7702032.0, "step": 7300 }, { "entropy": 0.1390564618736971, "epoch": 0.6459578033025419, "grad_norm": 5.125, "learning_rate": 0.0001825300390599132, "loss": 0.15615715980529785, "mean_token_accuracy": 0.9627314421534539, "num_tokens": 7727282.0, "step": 7325 }, { "entropy": 0.15579893960035407, "epoch": 0.6481624374435062, "grad_norm": 2.515625, "learning_rate": 0.0001823954045119779, "loss": 0.1994219970703125, "mean_token_accuracy": 0.9553171017765999, "num_tokens": 7754466.0, "step": 7350 }, { "entropy": 0.1711575994535815, "epoch": 0.6503670715844706, "grad_norm": 5.34375, "learning_rate": 0.0001822603032176291, "loss": 0.2133725929260254, "mean_token_accuracy": 0.9545243400335311, "num_tokens": 7781859.0, "step": 7375 }, { "entropy": 0.12246490114834159, "epoch": 0.6525717057254349, "grad_norm": 4.625, "learning_rate": 0.00018212473594217708, "loss": 0.15401289939880372, "mean_token_accuracy": 0.9670217049121856, "num_tokens": 7807817.0, "step": 7400 }, { "entropy": 0.14397913629829417, "epoch": 0.6547763398663992, "grad_norm": 4.71875, "learning_rate": 0.00018198870345357169, "loss": 0.2187386131286621, "mean_token_accuracy": 0.9631443306803703, "num_tokens": 7834226.0, "step": 7425 }, { "entropy": 0.12889255251706344, "epoch": 0.6569809740073634, "grad_norm": 4.0625, "learning_rate": 0.00018185220652239807, "loss": 0.17164052963256837, "mean_token_accuracy": 0.9652298155426979, "num_tokens": 7860383.0, "step": 7450 }, { "entropy": 0.14436891936697066, "epoch": 0.6591856081483278, "grad_norm": 2.625, "learning_rate": 0.00018171524592187237, "loss": 0.1861957550048828, "mean_token_accuracy": 0.9579941752552986, "num_tokens": 7885728.0, "step": 7475 }, { "entropy": 0.15438391114235855, "epoch": 0.6613902422892921, "grad_norm": 5.625, "learning_rate": 0.00018157782242783722, "loss": 0.21105031967163085, "mean_token_accuracy": 0.9592954632639885, "num_tokens": 7912175.0, "step": 7500 }, { "entropy": 0.1324482882732991, "epoch": 0.6635948764302564, "grad_norm": 2.09375, "learning_rate": 0.00018143993681875737, "loss": 0.1580478572845459, "mean_token_accuracy": 0.9634089484810829, "num_tokens": 7938822.0, "step": 7525 }, { "entropy": 0.14074511501239612, "epoch": 0.6657995105712207, "grad_norm": 8.625, "learning_rate": 0.00018130158987571547, "loss": 0.1955801010131836, "mean_token_accuracy": 0.9611077249050141, "num_tokens": 7964876.0, "step": 7550 }, { "entropy": 0.13242758714361116, "epoch": 0.668004144712185, "grad_norm": 4.78125, "learning_rate": 0.00018116278238240735, "loss": 0.15283177375793458, "mean_token_accuracy": 0.9638253870606422, "num_tokens": 7990616.0, "step": 7575 }, { "entropy": 0.11454111101804301, "epoch": 0.6702087788531493, "grad_norm": 5.9375, "learning_rate": 0.0001810235151251378, "loss": 0.14967763900756836, "mean_token_accuracy": 0.9682707318663597, "num_tokens": 8016050.0, "step": 7600 }, { "entropy": 0.11883930406940635, "epoch": 0.6724134129941136, "grad_norm": 4.34375, "learning_rate": 0.00018088378889281602, "loss": 0.15552496910095215, "mean_token_accuracy": 0.9669474244117737, "num_tokens": 8042320.0, "step": 7625 }, { "entropy": 0.1356273195706308, "epoch": 0.6746180471350779, "grad_norm": 6.625, "learning_rate": 0.00018074360447695113, "loss": 0.19098442077636718, "mean_token_accuracy": 0.9609265148639679, "num_tokens": 8068871.0, "step": 7650 }, { "entropy": 0.15299125998280944, "epoch": 0.6768226812760423, "grad_norm": 5.21875, "learning_rate": 0.00018060296267164789, "loss": 0.17693784713745117, "mean_token_accuracy": 0.9612268942594528, "num_tokens": 8094092.0, "step": 7675 }, { "entropy": 0.12178941715159454, "epoch": 0.6790273154170066, "grad_norm": 4.875, "learning_rate": 0.00018046186427360177, "loss": 0.17469738006591798, "mean_token_accuracy": 0.9637601950764656, "num_tokens": 8120040.0, "step": 7700 }, { "entropy": 0.14162486122571863, "epoch": 0.6812319495579708, "grad_norm": 6.0, "learning_rate": 0.00018032031008209502, "loss": 0.19666408538818358, "mean_token_accuracy": 0.9620895192027092, "num_tokens": 8145821.0, "step": 7725 }, { "entropy": 0.1300551049981732, "epoch": 0.6834365836989351, "grad_norm": 2.828125, "learning_rate": 0.00018017830089899154, "loss": 0.202286434173584, "mean_token_accuracy": 0.96599944293499, "num_tokens": 8172465.0, "step": 7750 }, { "entropy": 0.13608546175644734, "epoch": 0.6856412178398995, "grad_norm": 2.171875, "learning_rate": 0.00018003583752873283, "loss": 0.19469423294067384, "mean_token_accuracy": 0.9608579614758491, "num_tokens": 8199428.0, "step": 7775 }, { "entropy": 0.17455945937603246, "epoch": 0.6878458519808638, "grad_norm": 7.0625, "learning_rate": 0.00017989292077833313, "loss": 0.22791748046875, "mean_token_accuracy": 0.9548394048213958, "num_tokens": 8226599.0, "step": 7800 }, { "entropy": 0.11592018370109144, "epoch": 0.6900504861218281, "grad_norm": 5.0625, "learning_rate": 0.000179749551457375, "loss": 0.16286245346069336, "mean_token_accuracy": 0.9658330598473549, "num_tokens": 8253197.0, "step": 7825 }, { "entropy": 0.12122963964822703, "epoch": 0.6922551202627923, "grad_norm": 8.125, "learning_rate": 0.00017960573037800463, "loss": 0.1468034553527832, "mean_token_accuracy": 0.9656573352217674, "num_tokens": 8278589.0, "step": 7850 }, { "entropy": 0.13613144385279155, "epoch": 0.6944597544037567, "grad_norm": 2.4375, "learning_rate": 0.00017946145835492735, "loss": 0.1826793670654297, "mean_token_accuracy": 0.9617566013336182, "num_tokens": 8305687.0, "step": 7875 }, { "entropy": 0.129446152941091, "epoch": 0.696664388544721, "grad_norm": 4.125, "learning_rate": 0.0001793167362054029, "loss": 0.19100290298461914, "mean_token_accuracy": 0.9633017927408218, "num_tokens": 8331350.0, "step": 7900 }, { "entropy": 0.11511508471041453, "epoch": 0.6988690226856853, "grad_norm": 3.03125, "learning_rate": 0.0001791715647492409, "loss": 0.14485804557800294, "mean_token_accuracy": 0.9666005888581276, "num_tokens": 8357509.0, "step": 7925 }, { "entropy": 0.1282959767448483, "epoch": 0.7010736568266496, "grad_norm": 2.734375, "learning_rate": 0.00017902594480879622, "loss": 0.17366495132446289, "mean_token_accuracy": 0.9591986963152885, "num_tokens": 8384200.0, "step": 7950 }, { "entropy": 0.1282369938242482, "epoch": 0.703278290967614, "grad_norm": 5.53125, "learning_rate": 0.00017887987720896406, "loss": 0.1773942756652832, "mean_token_accuracy": 0.9644914370775223, "num_tokens": 8410909.0, "step": 7975 }, { "entropy": 0.1158729085046798, "epoch": 0.7054829251085782, "grad_norm": 4.34375, "learning_rate": 0.00017873336277717574, "loss": 0.1543659210205078, "mean_token_accuracy": 0.9665400749444961, "num_tokens": 8436352.0, "step": 8000 }, { "epoch": 0.7054829251085782, "eval_entropy": 0.07064501758729272, "eval_loss": 0.09072922170162201, "eval_mean_token_accuracy": 0.9757660988752379, "eval_num_tokens": 8436352.0, "eval_runtime": 235.5705, "eval_samples_per_second": 16.679, "eval_steps_per_second": 4.173, "step": 8000 }, { "entropy": 0.11761951618886086, "epoch": 0.7076875592495425, "grad_norm": 3.046875, "learning_rate": 0.0001785864023433936, "loss": 0.1570788288116455, "mean_token_accuracy": 0.9673075690865517, "num_tokens": 8461673.0, "step": 8025 }, { "entropy": 0.11300159072328825, "epoch": 0.7098921933905068, "grad_norm": 3.9375, "learning_rate": 0.00017843899674010641, "loss": 0.15068492889404297, "mean_token_accuracy": 0.9611266851425171, "num_tokens": 8488491.0, "step": 8050 }, { "entropy": 0.12430981354729738, "epoch": 0.7120968275314712, "grad_norm": 4.1875, "learning_rate": 0.0001782911468023249, "loss": 0.17625951766967773, "mean_token_accuracy": 0.962748230099678, "num_tokens": 8514512.0, "step": 8075 }, { "entropy": 0.12553293525765186, "epoch": 0.7143014616724355, "grad_norm": 5.3125, "learning_rate": 0.00017814285336757664, "loss": 0.16704462051391603, "mean_token_accuracy": 0.96374351978302, "num_tokens": 8540515.0, "step": 8100 }, { "entropy": 0.11906734269752633, "epoch": 0.7165060958133997, "grad_norm": 5.375, "learning_rate": 0.00017799411727590153, "loss": 0.14968764305114746, "mean_token_accuracy": 0.9667163553833962, "num_tokens": 8565892.0, "step": 8125 }, { "entropy": 0.1386713864444755, "epoch": 0.718710729954364, "grad_norm": 4.59375, "learning_rate": 0.00017784493936984705, "loss": 0.19567285537719725, "mean_token_accuracy": 0.9598614898324013, "num_tokens": 8592214.0, "step": 8150 }, { "entropy": 0.10956784428621176, "epoch": 0.7209153640953284, "grad_norm": 6.59375, "learning_rate": 0.0001776953204944634, "loss": 0.1550135135650635, "mean_token_accuracy": 0.9683659729361535, "num_tokens": 8618032.0, "step": 8175 }, { "entropy": 0.11613133529463085, "epoch": 0.7231199982362927, "grad_norm": 5.4375, "learning_rate": 0.00017754526149729868, "loss": 0.15899131774902345, "mean_token_accuracy": 0.9638846132159233, "num_tokens": 8644256.0, "step": 8200 }, { "entropy": 0.14448100091249216, "epoch": 0.725324632377257, "grad_norm": 3.0625, "learning_rate": 0.00017739476322839427, "loss": 0.17772165298461914, "mean_token_accuracy": 0.9560806438326835, "num_tokens": 8670775.0, "step": 8225 }, { "entropy": 0.12614294623752356, "epoch": 0.7275292665182213, "grad_norm": 4.4375, "learning_rate": 0.00017724382654027985, "loss": 0.1638924217224121, "mean_token_accuracy": 0.9645272868871689, "num_tokens": 8697753.0, "step": 8250 }, { "entropy": 0.12692207982821857, "epoch": 0.7297339006591856, "grad_norm": 3.265625, "learning_rate": 0.00017709245228796856, "loss": 0.1772811508178711, "mean_token_accuracy": 0.9636184599995613, "num_tokens": 8724660.0, "step": 8275 }, { "entropy": 0.12508263059076852, "epoch": 0.7319385348001499, "grad_norm": 8.0, "learning_rate": 0.00017694064132895232, "loss": 0.16712051391601562, "mean_token_accuracy": 0.9643837228417397, "num_tokens": 8750683.0, "step": 8300 }, { "entropy": 0.14396141051198355, "epoch": 0.7341431689411142, "grad_norm": 1.8671875, "learning_rate": 0.0001767883945231968, "loss": 0.19680845260620117, "mean_token_accuracy": 0.9622863036394119, "num_tokens": 8777248.0, "step": 8325 }, { "entropy": 0.1027900730050169, "epoch": 0.7363478030820785, "grad_norm": 5.3125, "learning_rate": 0.00017663571273313658, "loss": 0.15053895950317384, "mean_token_accuracy": 0.9678994670510292, "num_tokens": 8803326.0, "step": 8350 }, { "entropy": 0.1591771748010069, "epoch": 0.7385524372230429, "grad_norm": 4.21875, "learning_rate": 0.00017648259682367042, "loss": 0.1984667205810547, "mean_token_accuracy": 0.9540786528587342, "num_tokens": 8830566.0, "step": 8375 }, { "entropy": 0.1073954068531748, "epoch": 0.7407570713640071, "grad_norm": 6.0, "learning_rate": 0.00017632904766215618, "loss": 0.13572030067443847, "mean_token_accuracy": 0.9658256524801254, "num_tokens": 8855529.0, "step": 8400 }, { "entropy": 0.12134030096232891, "epoch": 0.7429617055049714, "grad_norm": 2.9375, "learning_rate": 0.00017617506611840596, "loss": 0.1593620204925537, "mean_token_accuracy": 0.9623824095726013, "num_tokens": 8883268.0, "step": 8425 }, { "entropy": 0.1269017940975027, "epoch": 0.7451663396459357, "grad_norm": 3.171875, "learning_rate": 0.00017602065306468118, "loss": 0.15877607345581055, "mean_token_accuracy": 0.965101033449173, "num_tokens": 8908917.0, "step": 8450 }, { "entropy": 0.11544961552310269, "epoch": 0.7473709737869001, "grad_norm": 7.375, "learning_rate": 0.00017586580937568763, "loss": 0.15559195518493651, "mean_token_accuracy": 0.9670590379834175, "num_tokens": 8934706.0, "step": 8475 }, { "entropy": 0.13363998796266968, "epoch": 0.7495756079278644, "grad_norm": 6.84375, "learning_rate": 0.00017571053592857055, "loss": 0.19044275283813478, "mean_token_accuracy": 0.9597599840164185, "num_tokens": 8962668.0, "step": 8500 }, { "entropy": 0.11302992556476965, "epoch": 0.7517802420688287, "grad_norm": 4.78125, "learning_rate": 0.00017555483360290968, "loss": 0.1435883617401123, "mean_token_accuracy": 0.969543283879757, "num_tokens": 8987940.0, "step": 8525 }, { "entropy": 0.12368935061385855, "epoch": 0.7539848762097929, "grad_norm": 3.1875, "learning_rate": 0.0001753987032807141, "loss": 0.15028656005859375, "mean_token_accuracy": 0.9645920068025589, "num_tokens": 9012995.0, "step": 8550 }, { "entropy": 0.12879173549008555, "epoch": 0.7561895103507573, "grad_norm": 5.375, "learning_rate": 0.00017524214584641756, "loss": 0.16836380004882812, "mean_token_accuracy": 0.963255665898323, "num_tokens": 9039182.0, "step": 8575 }, { "entropy": 0.13748559794155882, "epoch": 0.7583941444917216, "grad_norm": 6.40625, "learning_rate": 0.0001750851621868731, "loss": 0.1914215850830078, "mean_token_accuracy": 0.9576075798273087, "num_tokens": 9066623.0, "step": 8600 }, { "entropy": 0.13106511580059305, "epoch": 0.7605987786326859, "grad_norm": 5.625, "learning_rate": 0.00017492775319134828, "loss": 0.1786070442199707, "mean_token_accuracy": 0.9630159053206444, "num_tokens": 9093239.0, "step": 8625 }, { "entropy": 0.11938790226588025, "epoch": 0.7628034127736502, "grad_norm": 2.3125, "learning_rate": 0.0001747699197515201, "loss": 0.17017173767089844, "mean_token_accuracy": 0.9645773348212242, "num_tokens": 9120282.0, "step": 8650 }, { "entropy": 0.11365445770847145, "epoch": 0.7650080469146145, "grad_norm": 3.4375, "learning_rate": 0.00017461166276146986, "loss": 0.16278697967529296, "mean_token_accuracy": 0.9636175134778022, "num_tokens": 9146109.0, "step": 8675 }, { "entropy": 0.11282925648498349, "epoch": 0.7672126810555788, "grad_norm": 16.0, "learning_rate": 0.00017445298311767818, "loss": 0.1740219497680664, "mean_token_accuracy": 0.9688749271631241, "num_tokens": 9172545.0, "step": 8700 }, { "entropy": 0.154731562990346, "epoch": 0.7694173151965431, "grad_norm": 3.765625, "learning_rate": 0.00017429388171901988, "loss": 0.20200132369995116, "mean_token_accuracy": 0.957536070048809, "num_tokens": 9199798.0, "step": 8725 }, { "entropy": 0.11487452790664975, "epoch": 0.7716219493375075, "grad_norm": 4.53125, "learning_rate": 0.00017413435946675887, "loss": 0.16370033264160155, "mean_token_accuracy": 0.9644585126638412, "num_tokens": 9226606.0, "step": 8750 }, { "entropy": 0.12250090444111265, "epoch": 0.7738265834784718, "grad_norm": 4.3125, "learning_rate": 0.00017397441726454312, "loss": 0.1650483512878418, "mean_token_accuracy": 0.9627638322114944, "num_tokens": 9253846.0, "step": 8775 }, { "entropy": 0.12166486009780783, "epoch": 0.776031217619436, "grad_norm": 4.125, "learning_rate": 0.00017381405601839953, "loss": 0.15638877868652343, "mean_token_accuracy": 0.9642614278197289, "num_tokens": 9280499.0, "step": 8800 }, { "entropy": 0.14424598600366154, "epoch": 0.7782358517604003, "grad_norm": 3.109375, "learning_rate": 0.0001736532766367287, "loss": 0.19656600952148437, "mean_token_accuracy": 0.9637108337879181, "num_tokens": 9306050.0, "step": 8825 }, { "entropy": 0.10348674034525175, "epoch": 0.7804404859013647, "grad_norm": 3.859375, "learning_rate": 0.00017349208003029985, "loss": 0.13371842384338378, "mean_token_accuracy": 0.9682561081647872, "num_tokens": 9331578.0, "step": 8850 }, { "entropy": 0.12325756757461931, "epoch": 0.782645120042329, "grad_norm": 3.34375, "learning_rate": 0.00017333046711224566, "loss": 0.1538134765625, "mean_token_accuracy": 0.9658476275205612, "num_tokens": 9356722.0, "step": 8875 }, { "entropy": 0.11024700316367671, "epoch": 0.7848497541832933, "grad_norm": 3.265625, "learning_rate": 0.00017316843879805713, "loss": 0.14064696311950683, "mean_token_accuracy": 0.9689941748976707, "num_tokens": 9383085.0, "step": 8900 }, { "entropy": 0.12162997972161975, "epoch": 0.7870543883242576, "grad_norm": 1.5390625, "learning_rate": 0.0001730059960055784, "loss": 0.17535165786743165, "mean_token_accuracy": 0.9637424668669701, "num_tokens": 9409568.0, "step": 8925 }, { "entropy": 0.13874562435550616, "epoch": 0.789259022465222, "grad_norm": 9.1875, "learning_rate": 0.00017284313965500135, "loss": 0.1936268424987793, "mean_token_accuracy": 0.9646750277280808, "num_tokens": 9437494.0, "step": 8950 }, { "entropy": 0.09118764624232427, "epoch": 0.7914636566061862, "grad_norm": 0.94921875, "learning_rate": 0.00017267987066886073, "loss": 0.09851828575134278, "mean_token_accuracy": 0.9754169529676437, "num_tokens": 9462219.0, "step": 8975 }, { "entropy": 0.12636590646347032, "epoch": 0.7936682907471505, "grad_norm": 2.796875, "learning_rate": 0.00017251618997202864, "loss": 0.19156661987304688, "mean_token_accuracy": 0.9624759146571159, "num_tokens": 9489546.0, "step": 9000 }, { "epoch": 0.7936682907471505, "eval_entropy": 0.06704333229587316, "eval_loss": 0.07078936696052551, "eval_mean_token_accuracy": 0.9801562535192814, "eval_num_tokens": 9489546.0, "eval_runtime": 227.2118, "eval_samples_per_second": 17.292, "eval_steps_per_second": 4.326, "step": 9000 }, { "entropy": 0.12248482231225352, "epoch": 0.7958729248881148, "grad_norm": 4.28125, "learning_rate": 0.0001723520984917095, "loss": 0.21896730422973631, "mean_token_accuracy": 0.9658117419481278, "num_tokens": 9516203.0, "step": 9025 }, { "entropy": 0.13453102754894644, "epoch": 0.7980775590290792, "grad_norm": 3.90625, "learning_rate": 0.00017218759715743453, "loss": 0.16762969970703126, "mean_token_accuracy": 0.9634674605727196, "num_tokens": 9544002.0, "step": 9050 }, { "entropy": 0.1296102111687651, "epoch": 0.8002821931700435, "grad_norm": 3.71875, "learning_rate": 0.00017202268690105684, "loss": 0.1634498977661133, "mean_token_accuracy": 0.9662830474972725, "num_tokens": 9570760.0, "step": 9075 }, { "entropy": 0.09645530783745926, "epoch": 0.8024868273110077, "grad_norm": 1.5, "learning_rate": 0.00017185736865674592, "loss": 0.1314550495147705, "mean_token_accuracy": 0.9713372036814689, "num_tokens": 9597145.0, "step": 9100 }, { "entropy": 0.1334079783054767, "epoch": 0.804691461451972, "grad_norm": 3.859375, "learning_rate": 0.00017169164336098227, "loss": 0.18433303833007814, "mean_token_accuracy": 0.9601713407039643, "num_tokens": 9623937.0, "step": 9125 }, { "entropy": 0.12334824909456074, "epoch": 0.8068960955929364, "grad_norm": 4.03125, "learning_rate": 0.0001715255119525524, "loss": 0.16472867965698243, "mean_token_accuracy": 0.9639821222424507, "num_tokens": 9650956.0, "step": 9150 }, { "entropy": 0.11855432161130011, "epoch": 0.8091007297339007, "grad_norm": 5.90625, "learning_rate": 0.0001713589753725432, "loss": 0.16404468536376954, "mean_token_accuracy": 0.9677329239249229, "num_tokens": 9677654.0, "step": 9175 }, { "entropy": 0.13992778839776293, "epoch": 0.811305363874865, "grad_norm": 2.203125, "learning_rate": 0.00017119203456433682, "loss": 0.17372175216674804, "mean_token_accuracy": 0.9608005279302597, "num_tokens": 9703015.0, "step": 9200 }, { "entropy": 0.11712702068965881, "epoch": 0.8135099980158292, "grad_norm": 2.171875, "learning_rate": 0.00017102469047360525, "loss": 0.14785463333129883, "mean_token_accuracy": 0.969508444070816, "num_tokens": 9728261.0, "step": 9225 }, { "entropy": 0.12233627852867357, "epoch": 0.8157146321567936, "grad_norm": 2.5625, "learning_rate": 0.00017085694404830485, "loss": 0.16637359619140624, "mean_token_accuracy": 0.9676308873295784, "num_tokens": 9754044.0, "step": 9250 }, { "entropy": 0.11624026085250079, "epoch": 0.8179192662977579, "grad_norm": 14.375, "learning_rate": 0.00017068879623867122, "loss": 0.18700075149536133, "mean_token_accuracy": 0.9679171699285507, "num_tokens": 9780803.0, "step": 9275 }, { "entropy": 0.12290247712633573, "epoch": 0.8201239004387222, "grad_norm": 2.03125, "learning_rate": 0.0001705202479972136, "loss": 0.15535794258117674, "mean_token_accuracy": 0.9672988221049309, "num_tokens": 9806834.0, "step": 9300 }, { "entropy": 0.11379165446502157, "epoch": 0.8223285345796865, "grad_norm": 2.875, "learning_rate": 0.00017035130027870965, "loss": 0.13857620239257812, "mean_token_accuracy": 0.9699993515014649, "num_tokens": 9833002.0, "step": 9325 }, { "entropy": 0.10841709365835413, "epoch": 0.8245331687206509, "grad_norm": 2.40625, "learning_rate": 0.00017018195404019983, "loss": 0.1723048782348633, "mean_token_accuracy": 0.9657841363549232, "num_tokens": 9859181.0, "step": 9350 }, { "entropy": 0.11595107901492156, "epoch": 0.8267378028616151, "grad_norm": 4.8125, "learning_rate": 0.00017001221024098224, "loss": 0.1496266269683838, "mean_token_accuracy": 0.9674661475419998, "num_tokens": 9886731.0, "step": 9375 }, { "entropy": 0.11216982064011972, "epoch": 0.8289424370025794, "grad_norm": 4.25, "learning_rate": 0.00016984206984260695, "loss": 0.14420658111572265, "mean_token_accuracy": 0.9676317548751832, "num_tokens": 9911898.0, "step": 9400 }, { "entropy": 0.11021179292700253, "epoch": 0.8311470711435437, "grad_norm": 5.65625, "learning_rate": 0.00016967153380887068, "loss": 0.16620290756225586, "mean_token_accuracy": 0.964969280064106, "num_tokens": 9938802.0, "step": 9425 }, { "entropy": 0.11348463983624242, "epoch": 0.8333517052845081, "grad_norm": 2.0, "learning_rate": 0.00016950060310581133, "loss": 0.1321702194213867, "mean_token_accuracy": 0.9677722403407096, "num_tokens": 9964208.0, "step": 9450 }, { "entropy": 0.11066792839614209, "epoch": 0.8355563394254724, "grad_norm": 5.0, "learning_rate": 0.00016932927870170244, "loss": 0.1695878028869629, "mean_token_accuracy": 0.962311232984066, "num_tokens": 9992411.0, "step": 9475 }, { "entropy": 0.11070761349285022, "epoch": 0.8377609735664366, "grad_norm": 5.28125, "learning_rate": 0.0001691575615670478, "loss": 0.1375373935699463, "mean_token_accuracy": 0.9715418082475662, "num_tokens": 10017700.0, "step": 9500 }, { "entropy": 0.11562138024251908, "epoch": 0.8399656077074009, "grad_norm": 6.5625, "learning_rate": 0.0001689854526745759, "loss": 0.15139179229736327, "mean_token_accuracy": 0.9657182011008263, "num_tokens": 10044558.0, "step": 9525 }, { "entropy": 0.11778819782077335, "epoch": 0.8421702418483653, "grad_norm": 2.75, "learning_rate": 0.00016881295299923444, "loss": 0.1503695774078369, "mean_token_accuracy": 0.9710509559512138, "num_tokens": 10071358.0, "step": 9550 }, { "entropy": 0.1135219477955252, "epoch": 0.8443748759893296, "grad_norm": 2.8125, "learning_rate": 0.00016864006351818473, "loss": 0.1394341278076172, "mean_token_accuracy": 0.9689531043171883, "num_tokens": 10096955.0, "step": 9575 }, { "entropy": 0.10299550660944078, "epoch": 0.8465795101302939, "grad_norm": 3.984375, "learning_rate": 0.00016846678521079627, "loss": 0.12168859481811524, "mean_token_accuracy": 0.9724226155877114, "num_tokens": 10123250.0, "step": 9600 }, { "entropy": 0.11091373673116323, "epoch": 0.8487841442712581, "grad_norm": 3.078125, "learning_rate": 0.00016829311905864114, "loss": 0.16308366775512695, "mean_token_accuracy": 0.967094420492649, "num_tokens": 10150855.0, "step": 9625 }, { "entropy": 0.11484327328740619, "epoch": 0.8509887784122225, "grad_norm": 5.40625, "learning_rate": 0.00016811906604548846, "loss": 0.14888720512390136, "mean_token_accuracy": 0.9673416930437088, "num_tokens": 10178188.0, "step": 9650 }, { "entropy": 0.12452099201036618, "epoch": 0.8531934125531868, "grad_norm": 4.53125, "learning_rate": 0.00016794462715729878, "loss": 0.16681873321533203, "mean_token_accuracy": 0.9647532105445862, "num_tokens": 10205220.0, "step": 9675 }, { "entropy": 0.0996164298261283, "epoch": 0.8553980466941511, "grad_norm": 3.9375, "learning_rate": 0.0001677698033822185, "loss": 0.1467697811126709, "mean_token_accuracy": 0.9715581625699997, "num_tokens": 10231496.0, "step": 9700 }, { "entropy": 0.10809524114127271, "epoch": 0.8576026808351154, "grad_norm": 4.625, "learning_rate": 0.00016759459571057445, "loss": 0.14575215339660644, "mean_token_accuracy": 0.9664224565029145, "num_tokens": 10257436.0, "step": 9725 }, { "entropy": 0.1113849281903822, "epoch": 0.8598073149760798, "grad_norm": 3.84375, "learning_rate": 0.0001674190051348679, "loss": 0.1565341281890869, "mean_token_accuracy": 0.9675761547684669, "num_tokens": 10283162.0, "step": 9750 }, { "entropy": 0.13911948825232684, "epoch": 0.862011949117044, "grad_norm": 4.96875, "learning_rate": 0.00016724303264976928, "loss": 0.20747381210327148, "mean_token_accuracy": 0.9587804532051086, "num_tokens": 10310772.0, "step": 9775 }, { "entropy": 0.09862252110615373, "epoch": 0.8642165832580083, "grad_norm": 3.984375, "learning_rate": 0.00016706667925211246, "loss": 0.11554966926574707, "mean_token_accuracy": 0.9711972030997277, "num_tokens": 10337443.0, "step": 9800 }, { "entropy": 0.12474286954733543, "epoch": 0.8664212173989726, "grad_norm": 3.578125, "learning_rate": 0.00016688994594088897, "loss": 0.16256399154663087, "mean_token_accuracy": 0.9645921823382377, "num_tokens": 10364440.0, "step": 9825 }, { "entropy": 0.11431661496520974, "epoch": 0.868625851539937, "grad_norm": 1.1484375, "learning_rate": 0.00016671283371724258, "loss": 0.17474599838256835, "mean_token_accuracy": 0.9650062435865402, "num_tokens": 10390382.0, "step": 9850 }, { "entropy": 0.11188884009723551, "epoch": 0.8708304856809013, "grad_norm": 5.40625, "learning_rate": 0.00016653534358446333, "loss": 0.16477123260498047, "mean_token_accuracy": 0.9655068427324295, "num_tokens": 10417334.0, "step": 9875 }, { "entropy": 0.13186935628706123, "epoch": 0.8730351198218655, "grad_norm": 4.0625, "learning_rate": 0.0001663574765479821, "loss": 0.16050004959106445, "mean_token_accuracy": 0.9649894621968269, "num_tokens": 10445100.0, "step": 9900 }, { "entropy": 0.09267763423384168, "epoch": 0.8752397539628298, "grad_norm": 3.03125, "learning_rate": 0.00016617923361536481, "loss": 0.12584033966064453, "mean_token_accuracy": 0.9726918703317642, "num_tokens": 10471396.0, "step": 9925 }, { "entropy": 0.10969781915540806, "epoch": 0.8774443881037942, "grad_norm": 4.28125, "learning_rate": 0.00016600061579630682, "loss": 0.1531198787689209, "mean_token_accuracy": 0.9685003688931465, "num_tokens": 10496896.0, "step": 9950 }, { "entropy": 0.10426673835027031, "epoch": 0.8796490222447585, "grad_norm": 2.71875, "learning_rate": 0.00016582162410262683, "loss": 0.141700439453125, "mean_token_accuracy": 0.9694043102860451, "num_tokens": 10524052.0, "step": 9975 }, { "entropy": 0.10985518074186984, "epoch": 0.8818536563857228, "grad_norm": 5.5625, "learning_rate": 0.0001656422595482618, "loss": 0.1478671932220459, "mean_token_accuracy": 0.9703094553947449, "num_tokens": 10550484.0, "step": 10000 }, { "epoch": 0.8818536563857228, "eval_entropy": 0.06747981444776195, "eval_loss": 0.06752390414476395, "eval_mean_token_accuracy": 0.9817088659272917, "eval_num_tokens": 10550484.0, "eval_runtime": 245.6253, "eval_samples_per_second": 15.996, "eval_steps_per_second": 4.002, "step": 10000 }, { "entropy": 0.12341946017812006, "epoch": 0.884058290526687, "grad_norm": 1.953125, "learning_rate": 0.0001654625231492605, "loss": 0.16616138458251953, "mean_token_accuracy": 0.967511080801487, "num_tokens": 10577372.0, "step": 10025 }, { "entropy": 0.13170893985428847, "epoch": 0.8862629246676514, "grad_norm": 3.640625, "learning_rate": 0.00016528241592377838, "loss": 0.19268482208251952, "mean_token_accuracy": 0.9656442356109619, "num_tokens": 10604954.0, "step": 10050 }, { "entropy": 0.10221802647865844, "epoch": 0.8884675588086157, "grad_norm": 5.0625, "learning_rate": 0.00016510193889207128, "loss": 0.14701667785644532, "mean_token_accuracy": 0.9697395420074463, "num_tokens": 10631480.0, "step": 10075 }, { "entropy": 0.11693994878907688, "epoch": 0.89067219294958, "grad_norm": 3.53125, "learning_rate": 0.00016492109307649012, "loss": 0.1495195198059082, "mean_token_accuracy": 0.9655201029777527, "num_tokens": 10657009.0, "step": 10100 }, { "entropy": 0.1266687322355574, "epoch": 0.8928768270905443, "grad_norm": 3.015625, "learning_rate": 0.00016473987950147464, "loss": 0.1930523109436035, "mean_token_accuracy": 0.9648179452121258, "num_tokens": 10684895.0, "step": 10125 }, { "entropy": 0.1290986440368579, "epoch": 0.8950814612315087, "grad_norm": 3.40625, "learning_rate": 0.00016455829919354798, "loss": 0.15923919677734374, "mean_token_accuracy": 0.9646768179535866, "num_tokens": 10712424.0, "step": 10150 }, { "entropy": 0.09042322851746576, "epoch": 0.897286095372473, "grad_norm": 4.28125, "learning_rate": 0.00016437635318131068, "loss": 0.1193246841430664, "mean_token_accuracy": 0.9700718694925308, "num_tokens": 10738436.0, "step": 10175 }, { "entropy": 0.11201601465581916, "epoch": 0.8994907295134372, "grad_norm": 6.5625, "learning_rate": 0.0001641940424954349, "loss": 0.1540191078186035, "mean_token_accuracy": 0.9686597174406052, "num_tokens": 10765461.0, "step": 10200 }, { "entropy": 0.11977429527789354, "epoch": 0.9016953636544015, "grad_norm": 2.703125, "learning_rate": 0.00016401136816865858, "loss": 0.1758272933959961, "mean_token_accuracy": 0.9664454838633537, "num_tokens": 10792485.0, "step": 10225 }, { "entropy": 0.1012404073565267, "epoch": 0.9038999977953659, "grad_norm": 1.5390625, "learning_rate": 0.0001638283312357795, "loss": 0.12225442886352539, "mean_token_accuracy": 0.9731503677368164, "num_tokens": 10817757.0, "step": 10250 }, { "entropy": 0.13167083914973773, "epoch": 0.9061046319363302, "grad_norm": 4.3125, "learning_rate": 0.0001636449327336496, "loss": 0.17429544448852538, "mean_token_accuracy": 0.9628306743502617, "num_tokens": 10845750.0, "step": 10275 }, { "entropy": 0.08767559766652994, "epoch": 0.9083092660772945, "grad_norm": 3.96875, "learning_rate": 0.00016346117370116898, "loss": 0.10843348503112793, "mean_token_accuracy": 0.9764333325624466, "num_tokens": 10871753.0, "step": 10300 }, { "entropy": 0.10268983447313076, "epoch": 0.9105139002182587, "grad_norm": 1.4921875, "learning_rate": 0.00016327705517927998, "loss": 0.15269672393798828, "mean_token_accuracy": 0.969476004242897, "num_tokens": 10897973.0, "step": 10325 }, { "entropy": 0.11594196151621873, "epoch": 0.9127185343592231, "grad_norm": 3.171875, "learning_rate": 0.0001630925782109615, "loss": 0.14017095565795898, "mean_token_accuracy": 0.9712727320194244, "num_tokens": 10925576.0, "step": 10350 }, { "entropy": 0.09633901939727366, "epoch": 0.9149231685001874, "grad_norm": 3.46875, "learning_rate": 0.0001629077438412227, "loss": 0.12564464569091796, "mean_token_accuracy": 0.9723188054561615, "num_tokens": 10953120.0, "step": 10375 }, { "entropy": 0.09749640888854628, "epoch": 0.9171278026411517, "grad_norm": 1.3125, "learning_rate": 0.00016272255311709755, "loss": 0.12157401084899902, "mean_token_accuracy": 0.9729098379611969, "num_tokens": 10980088.0, "step": 10400 }, { "entropy": 0.09341563728055917, "epoch": 0.919332436782116, "grad_norm": 3.640625, "learning_rate": 0.00016253700708763848, "loss": 0.12513632774353028, "mean_token_accuracy": 0.9720851957798005, "num_tokens": 11006112.0, "step": 10425 }, { "entropy": 0.11993184727121843, "epoch": 0.9215370709230803, "grad_norm": 4.375, "learning_rate": 0.0001623511068039108, "loss": 0.19810443878173828, "mean_token_accuracy": 0.963067757487297, "num_tokens": 11035186.0, "step": 10450 }, { "entropy": 0.10798994577955455, "epoch": 0.9237417050640446, "grad_norm": 1.0546875, "learning_rate": 0.00016216485331898642, "loss": 0.1386628818511963, "mean_token_accuracy": 0.969612886607647, "num_tokens": 11061651.0, "step": 10475 }, { "entropy": 0.11358262816502247, "epoch": 0.9259463392050089, "grad_norm": 1.2578125, "learning_rate": 0.0001619782476879381, "loss": 0.19460039138793944, "mean_token_accuracy": 0.966367999613285, "num_tokens": 11088569.0, "step": 10500 }, { "entropy": 0.1167042134446092, "epoch": 0.9281509733459732, "grad_norm": 5.78125, "learning_rate": 0.0001617912909678334, "loss": 0.15505099296569824, "mean_token_accuracy": 0.9686264345049858, "num_tokens": 11115599.0, "step": 10525 }, { "entropy": 0.1133784833503887, "epoch": 0.9303556074869376, "grad_norm": 2.28125, "learning_rate": 0.00016160398421772868, "loss": 0.1678891372680664, "mean_token_accuracy": 0.9675005570054054, "num_tokens": 11142314.0, "step": 10550 }, { "entropy": 0.09698103930742946, "epoch": 0.9325602416279019, "grad_norm": 3.21875, "learning_rate": 0.0001614163284986632, "loss": 0.14286434173583984, "mean_token_accuracy": 0.969823080599308, "num_tokens": 11168533.0, "step": 10575 }, { "entropy": 0.10122561583353672, "epoch": 0.9347648757688661, "grad_norm": 3.140625, "learning_rate": 0.0001612283248736529, "loss": 0.17256240844726561, "mean_token_accuracy": 0.969860375225544, "num_tokens": 11194503.0, "step": 10600 }, { "entropy": 0.10910014692519326, "epoch": 0.9369695099098304, "grad_norm": 1.1796875, "learning_rate": 0.00016103997440768456, "loss": 0.1422108840942383, "mean_token_accuracy": 0.9693578332662582, "num_tokens": 11219802.0, "step": 10625 }, { "entropy": 0.11992133250838378, "epoch": 0.9391741440507948, "grad_norm": 3.390625, "learning_rate": 0.0001608512781677098, "loss": 0.16429296493530274, "mean_token_accuracy": 0.9664942741394043, "num_tokens": 11248011.0, "step": 10650 }, { "entropy": 0.10503609779349063, "epoch": 0.9413787781917591, "grad_norm": 4.53125, "learning_rate": 0.00016066223722263883, "loss": 0.14775081634521484, "mean_token_accuracy": 0.9697583362460136, "num_tokens": 11275171.0, "step": 10675 }, { "entropy": 0.10717430006247014, "epoch": 0.9435834123327234, "grad_norm": 2.96875, "learning_rate": 0.00016047285264333462, "loss": 0.13536286354064941, "mean_token_accuracy": 0.9718959403038024, "num_tokens": 11302493.0, "step": 10700 }, { "entropy": 0.12358801149821375, "epoch": 0.9457880464736876, "grad_norm": 5.09375, "learning_rate": 0.00016028312550260657, "loss": 0.16921548843383788, "mean_token_accuracy": 0.9650797703862191, "num_tokens": 11330172.0, "step": 10725 }, { "entropy": 0.09451631994277704, "epoch": 0.947992680614652, "grad_norm": 0.84375, "learning_rate": 0.00016009305687520478, "loss": 0.13955138206481935, "mean_token_accuracy": 0.9695536935329437, "num_tokens": 11356828.0, "step": 10750 }, { "entropy": 0.09438925628404832, "epoch": 0.9501973147556163, "grad_norm": 5.03125, "learning_rate": 0.00015990264783781366, "loss": 0.13763263702392578, "mean_token_accuracy": 0.9733442470431328, "num_tokens": 11383487.0, "step": 10775 }, { "entropy": 0.10031276696827263, "epoch": 0.9524019488965806, "grad_norm": 2.65625, "learning_rate": 0.00015971189946904603, "loss": 0.11601199150085449, "mean_token_accuracy": 0.9713838744163513, "num_tokens": 11408411.0, "step": 10800 }, { "entropy": 0.08611906496749726, "epoch": 0.9546065830375449, "grad_norm": 3.390625, "learning_rate": 0.0001595208128494368, "loss": 0.12846466064453124, "mean_token_accuracy": 0.974527842104435, "num_tokens": 11434288.0, "step": 10825 }, { "entropy": 0.09868820410221815, "epoch": 0.9568112171785093, "grad_norm": 4.6875, "learning_rate": 0.0001593293890614371, "loss": 0.14539647102355957, "mean_token_accuracy": 0.9727686950564385, "num_tokens": 11461369.0, "step": 10850 }, { "entropy": 0.10726987033616751, "epoch": 0.9590158513194735, "grad_norm": 1.6328125, "learning_rate": 0.00015913762918940796, "loss": 0.15492965698242187, "mean_token_accuracy": 0.9678599080443382, "num_tokens": 11487252.0, "step": 10875 }, { "entropy": 0.11905311323818751, "epoch": 0.9612204854604378, "grad_norm": 3.375, "learning_rate": 0.0001589455343196142, "loss": 0.1703525733947754, "mean_token_accuracy": 0.9683392706513405, "num_tokens": 11513191.0, "step": 10900 }, { "entropy": 0.09980171727540438, "epoch": 0.9634251196014022, "grad_norm": 0.416015625, "learning_rate": 0.0001587531055402184, "loss": 0.140428409576416, "mean_token_accuracy": 0.9712198379635811, "num_tokens": 11538104.0, "step": 10925 }, { "entropy": 0.08444582270516549, "epoch": 0.9656297537423665, "grad_norm": 2.921875, "learning_rate": 0.00015856034394127458, "loss": 0.11888874053955079, "mean_token_accuracy": 0.9720085749030113, "num_tokens": 11563627.0, "step": 10950 }, { "entropy": 0.08891735835233704, "epoch": 0.9678343878833308, "grad_norm": 5.6875, "learning_rate": 0.00015836725061472206, "loss": 0.11186134338378906, "mean_token_accuracy": 0.9750255295634269, "num_tokens": 11588193.0, "step": 10975 }, { "entropy": 0.0976211961361696, "epoch": 0.970039022024295, "grad_norm": 5.65625, "learning_rate": 0.00015817382665437934, "loss": 0.13658031463623047, "mean_token_accuracy": 0.9715496882796287, "num_tokens": 11613550.0, "step": 11000 }, { "epoch": 0.970039022024295, "eval_entropy": 0.04970914650081239, "eval_loss": 0.05864229425787926, "eval_mean_token_accuracy": 0.9834782193175168, "eval_num_tokens": 11613550.0, "eval_runtime": 244.2638, "eval_samples_per_second": 16.085, "eval_steps_per_second": 4.024, "step": 11000 }, { "entropy": 0.09031637163367122, "epoch": 0.9722436561652594, "grad_norm": 6.5, "learning_rate": 0.00015798007315593793, "loss": 0.13898273468017577, "mean_token_accuracy": 0.973123728632927, "num_tokens": 11640911.0, "step": 11025 }, { "entropy": 0.10300864099699539, "epoch": 0.9744482903062237, "grad_norm": 1.9140625, "learning_rate": 0.00015778599121695592, "loss": 0.14298954963684082, "mean_token_accuracy": 0.9674796098470688, "num_tokens": 11667215.0, "step": 11050 }, { "entropy": 0.0918053998821415, "epoch": 0.976652924447188, "grad_norm": 2.53125, "learning_rate": 0.0001575915819368521, "loss": 0.1144057846069336, "mean_token_accuracy": 0.9736630937457085, "num_tokens": 11693740.0, "step": 11075 }, { "entropy": 0.0991861937311478, "epoch": 0.9788575585881523, "grad_norm": 3.25, "learning_rate": 0.0001573968464168994, "loss": 0.1382896327972412, "mean_token_accuracy": 0.9709669390320778, "num_tokens": 11721792.0, "step": 11100 }, { "entropy": 0.10513557809928897, "epoch": 0.9810621927291167, "grad_norm": 1.2421875, "learning_rate": 0.00015720178576021892, "loss": 0.14281551361083986, "mean_token_accuracy": 0.9703371456265449, "num_tokens": 11748074.0, "step": 11125 }, { "entropy": 0.07274828981404426, "epoch": 0.9832668268700809, "grad_norm": 3.640625, "learning_rate": 0.0001570064010717735, "loss": 0.08963616371154785, "mean_token_accuracy": 0.9772802451252938, "num_tokens": 11774661.0, "step": 11150 }, { "entropy": 0.09237290589720942, "epoch": 0.9854714610110452, "grad_norm": 0.8515625, "learning_rate": 0.00015681069345836157, "loss": 0.12891390800476074, "mean_token_accuracy": 0.9716602417826653, "num_tokens": 11802101.0, "step": 11175 }, { "entropy": 0.10531749608111568, "epoch": 0.9876760951520095, "grad_norm": 1.6953125, "learning_rate": 0.0001566146640286108, "loss": 0.16108951568603516, "mean_token_accuracy": 0.9681805393099785, "num_tokens": 11828012.0, "step": 11200 }, { "entropy": 0.10035160956729669, "epoch": 0.9898807292929739, "grad_norm": 3.578125, "learning_rate": 0.00015641831389297188, "loss": 0.1179033374786377, "mean_token_accuracy": 0.9694277629256248, "num_tokens": 11854834.0, "step": 11225 }, { "entropy": 0.10796484625694575, "epoch": 0.9920853634339382, "grad_norm": 3.46875, "learning_rate": 0.00015622164416371218, "loss": 0.16446849822998047, "mean_token_accuracy": 0.9640506872534752, "num_tokens": 11882106.0, "step": 11250 }, { "entropy": 0.1411357653059531, "epoch": 0.9942899975749024, "grad_norm": 1.1484375, "learning_rate": 0.00015602465595490953, "loss": 0.19301389694213866, "mean_token_accuracy": 0.9607040500640869, "num_tokens": 11909716.0, "step": 11275 }, { "entropy": 0.0925849473947892, "epoch": 0.9964946317158667, "grad_norm": 2.015625, "learning_rate": 0.00015582735038244575, "loss": 0.1028307056427002, "mean_token_accuracy": 0.9762453472614289, "num_tokens": 11935816.0, "step": 11300 }, { "entropy": 0.08189033437593025, "epoch": 0.9986992658568311, "grad_norm": 2.296875, "learning_rate": 0.00015562972856400062, "loss": 0.1367438793182373, "mean_token_accuracy": 0.972346597313881, "num_tokens": 11961616.0, "step": 11325 }, { "entropy": 0.11836909589115437, "epoch": 1.0008818536563857, "grad_norm": 2.15625, "learning_rate": 0.00015543179161904517, "loss": 0.1542993450164795, "mean_token_accuracy": 0.9657085614674019, "num_tokens": 11987237.0, "step": 11350 }, { "entropy": 0.08593971867026994, "epoch": 1.00308648779735, "grad_norm": 3.640625, "learning_rate": 0.00015523354066883567, "loss": 0.1126481533050537, "mean_token_accuracy": 0.97596212297678, "num_tokens": 12013371.0, "step": 11375 }, { "entropy": 0.07548106749542058, "epoch": 1.0052911219383143, "grad_norm": 2.765625, "learning_rate": 0.000155034976836407, "loss": 0.11987373352050781, "mean_token_accuracy": 0.9775131213665008, "num_tokens": 12039685.0, "step": 11400 }, { "entropy": 0.07430520750611322, "epoch": 1.0074957560792785, "grad_norm": 5.3125, "learning_rate": 0.0001548361012465666, "loss": 0.10026315689086913, "mean_token_accuracy": 0.9790066388249398, "num_tokens": 12065476.0, "step": 11425 }, { "entropy": 0.07392804217728553, "epoch": 1.009700390220243, "grad_norm": 2.984375, "learning_rate": 0.00015463691502588778, "loss": 0.1030975341796875, "mean_token_accuracy": 0.9764667934179306, "num_tokens": 12092089.0, "step": 11450 }, { "entropy": 0.0857640357335913, "epoch": 1.0119050243612073, "grad_norm": 1.890625, "learning_rate": 0.00015443741930270353, "loss": 0.11760306358337402, "mean_token_accuracy": 0.9736816781759262, "num_tokens": 12119248.0, "step": 11475 }, { "entropy": 0.09214958660071716, "epoch": 1.0141096585021716, "grad_norm": 4.0625, "learning_rate": 0.00015423761520710015, "loss": 0.12875985145568847, "mean_token_accuracy": 0.9710692197084427, "num_tokens": 12146638.0, "step": 11500 }, { "entropy": 0.09558408751501701, "epoch": 1.0163142926431359, "grad_norm": 0.99609375, "learning_rate": 0.00015403750387091072, "loss": 0.11520135879516602, "mean_token_accuracy": 0.9736033695936203, "num_tokens": 12174242.0, "step": 11525 }, { "entropy": 0.08030253475852078, "epoch": 1.0185189267841002, "grad_norm": 1.7265625, "learning_rate": 0.0001538370864277087, "loss": 0.109821138381958, "mean_token_accuracy": 0.97515142172575, "num_tokens": 12201776.0, "step": 11550 }, { "entropy": 0.10750156398629769, "epoch": 1.0207235609250644, "grad_norm": 5.6875, "learning_rate": 0.00015363636401280166, "loss": 0.16224815368652343, "mean_token_accuracy": 0.9691583275794983, "num_tokens": 12229652.0, "step": 11575 }, { "entropy": 0.08844495263532735, "epoch": 1.0229281950660287, "grad_norm": 2.765625, "learning_rate": 0.00015343533776322467, "loss": 0.11790482521057129, "mean_token_accuracy": 0.9742652297019958, "num_tokens": 12256478.0, "step": 11600 }, { "entropy": 0.08379628753580619, "epoch": 1.025132829206993, "grad_norm": 13.25, "learning_rate": 0.000153234008817734, "loss": 0.12678858757019043, "mean_token_accuracy": 0.9716950944066047, "num_tokens": 12283290.0, "step": 11625 }, { "entropy": 0.0752360312268138, "epoch": 1.0273374633479575, "grad_norm": 2.140625, "learning_rate": 0.00015303237831680057, "loss": 0.10330279350280762, "mean_token_accuracy": 0.9772691050171852, "num_tokens": 12309083.0, "step": 11650 }, { "entropy": 0.10299436590634287, "epoch": 1.0295420974889218, "grad_norm": 1.6171875, "learning_rate": 0.00015283044740260358, "loss": 0.13117281913757325, "mean_token_accuracy": 0.9719769659638405, "num_tokens": 12335892.0, "step": 11675 }, { "entropy": 0.08254285944451112, "epoch": 1.031746731629886, "grad_norm": 3.3125, "learning_rate": 0.00015262821721902383, "loss": 0.1227413558959961, "mean_token_accuracy": 0.9741689735651016, "num_tokens": 12361808.0, "step": 11700 }, { "entropy": 0.08000514879269759, "epoch": 1.0339513657708503, "grad_norm": 3.5625, "learning_rate": 0.00015242568891163757, "loss": 0.10524334907531738, "mean_token_accuracy": 0.977083123922348, "num_tokens": 12388433.0, "step": 11725 }, { "entropy": 0.07193909519643057, "epoch": 1.0361559999118146, "grad_norm": 4.40625, "learning_rate": 0.00015222286362770975, "loss": 0.09867951393127442, "mean_token_accuracy": 0.9776046532392502, "num_tokens": 12415363.0, "step": 11750 }, { "entropy": 0.06196809031680459, "epoch": 1.038360634052779, "grad_norm": 3.4375, "learning_rate": 0.00015201974251618765, "loss": 0.09109333992004394, "mean_token_accuracy": 0.9808389428257942, "num_tokens": 12441196.0, "step": 11775 }, { "entropy": 0.07480443540145643, "epoch": 1.0405652681937432, "grad_norm": 1.8125, "learning_rate": 0.00015181632672769428, "loss": 0.1080815315246582, "mean_token_accuracy": 0.9766770607233047, "num_tokens": 12466919.0, "step": 11800 }, { "entropy": 0.07268392765254247, "epoch": 1.0427699023347075, "grad_norm": 3.765625, "learning_rate": 0.0001516126174145219, "loss": 0.11195050239562988, "mean_token_accuracy": 0.9795046743750572, "num_tokens": 12492043.0, "step": 11825 }, { "entropy": 0.07267954504699446, "epoch": 1.044974536475672, "grad_norm": 3.5, "learning_rate": 0.0001514086157306256, "loss": 0.08853129386901855, "mean_token_accuracy": 0.9804096031188965, "num_tokens": 12516405.0, "step": 11850 }, { "entropy": 0.07731569488940294, "epoch": 1.0471791706166362, "grad_norm": 3.640625, "learning_rate": 0.00015120432283161658, "loss": 0.10150125503540039, "mean_token_accuracy": 0.9787563198804855, "num_tokens": 12543253.0, "step": 11875 }, { "entropy": 0.07172672248416348, "epoch": 1.0493838047576005, "grad_norm": 1.7890625, "learning_rate": 0.0001509997398747557, "loss": 0.10923091888427734, "mean_token_accuracy": 0.9780178043246269, "num_tokens": 12569500.0, "step": 11900 }, { "entropy": 0.07830634732585168, "epoch": 1.0515884388985648, "grad_norm": 4.0625, "learning_rate": 0.00015079486801894697, "loss": 0.1168800163269043, "mean_token_accuracy": 0.9760348123311996, "num_tokens": 12595946.0, "step": 11925 }, { "entropy": 0.08794710203001159, "epoch": 1.053793073039529, "grad_norm": 2.4375, "learning_rate": 0.00015058970842473087, "loss": 0.11576443672180176, "mean_token_accuracy": 0.9739569133520126, "num_tokens": 12622062.0, "step": 11950 }, { "entropy": 0.0924138590262737, "epoch": 1.0559977071804934, "grad_norm": 4.03125, "learning_rate": 0.0001503842622542778, "loss": 0.12705818176269532, "mean_token_accuracy": 0.9734925562143326, "num_tokens": 12648382.0, "step": 11975 }, { "entropy": 0.08148457858012989, "epoch": 1.0582023413214576, "grad_norm": 1.65625, "learning_rate": 0.0001501785306713817, "loss": 0.10386928558349609, "mean_token_accuracy": 0.9757542279362679, "num_tokens": 12674085.0, "step": 12000 }, { "epoch": 1.0582023413214576, "eval_entropy": 0.046589221632191466, "eval_loss": 0.05319363623857498, "eval_mean_token_accuracy": 0.9850547621718259, "eval_num_tokens": 12674085.0, "eval_runtime": 245.4491, "eval_samples_per_second": 16.007, "eval_steps_per_second": 4.005, "step": 12000 }, { "entropy": 0.07984354873478879, "epoch": 1.060406975462422, "grad_norm": 1.5859375, "learning_rate": 0.00014997251484145306, "loss": 0.1102367115020752, "mean_token_accuracy": 0.9725632515549659, "num_tokens": 12700426.0, "step": 12025 }, { "entropy": 0.07065878395733308, "epoch": 1.0626116096033864, "grad_norm": 2.828125, "learning_rate": 0.00014976621593151276, "loss": 0.0880131721496582, "mean_token_accuracy": 0.9790845555067063, "num_tokens": 12725972.0, "step": 12050 }, { "entropy": 0.06189297338773031, "epoch": 1.0648162437443507, "grad_norm": 1.4375, "learning_rate": 0.0001495596351101851, "loss": 0.08140110969543457, "mean_token_accuracy": 0.9800572353601456, "num_tokens": 12751452.0, "step": 12075 }, { "entropy": 0.08883357329294085, "epoch": 1.067020877885315, "grad_norm": 4.21875, "learning_rate": 0.0001493527735476914, "loss": 0.14287374496459962, "mean_token_accuracy": 0.9703152641654015, "num_tokens": 12779942.0, "step": 12100 }, { "entropy": 0.11493293149920646, "epoch": 1.0692255120262792, "grad_norm": 4.3125, "learning_rate": 0.00014914563241584324, "loss": 0.14303622245788575, "mean_token_accuracy": 0.9675487235188485, "num_tokens": 12807288.0, "step": 12125 }, { "entropy": 0.07454075408430072, "epoch": 1.0714301461672435, "grad_norm": 2.125, "learning_rate": 0.000148938212888036, "loss": 0.0955147647857666, "mean_token_accuracy": 0.975711221396923, "num_tokens": 12833484.0, "step": 12150 }, { "entropy": 0.09217336674570106, "epoch": 1.0736347803082078, "grad_norm": 0.283203125, "learning_rate": 0.00014873051613924194, "loss": 0.12428475379943847, "mean_token_accuracy": 0.9730561417341232, "num_tokens": 12859854.0, "step": 12175 }, { "entropy": 0.07789780603197868, "epoch": 1.075839414449172, "grad_norm": 1.5859375, "learning_rate": 0.0001485225433460038, "loss": 0.10363780975341796, "mean_token_accuracy": 0.975709747672081, "num_tokens": 12885848.0, "step": 12200 }, { "entropy": 0.08167748826730531, "epoch": 1.0780440485901366, "grad_norm": 4.78125, "learning_rate": 0.00014831429568642798, "loss": 0.1125030517578125, "mean_token_accuracy": 0.9740335485339164, "num_tokens": 12913035.0, "step": 12225 }, { "entropy": 0.08957995633652899, "epoch": 1.0802486827311009, "grad_norm": 1.78125, "learning_rate": 0.00014810577434017802, "loss": 0.13156410217285155, "mean_token_accuracy": 0.9731645616889, "num_tokens": 12940009.0, "step": 12250 }, { "entropy": 0.0830923686773167, "epoch": 1.0824533168720651, "grad_norm": 4.53125, "learning_rate": 0.00014789698048846766, "loss": 0.10087099075317382, "mean_token_accuracy": 0.9760751956701279, "num_tokens": 12966012.0, "step": 12275 }, { "entropy": 0.061791106628807026, "epoch": 1.0846579510130294, "grad_norm": 2.625, "learning_rate": 0.0001476879153140544, "loss": 0.09330236434936523, "mean_token_accuracy": 0.9777177131175995, "num_tokens": 12991480.0, "step": 12300 }, { "entropy": 0.07935161782981595, "epoch": 1.0868625851539937, "grad_norm": 3.078125, "learning_rate": 0.00014747858000123268, "loss": 0.10592585563659668, "mean_token_accuracy": 0.977505379319191, "num_tokens": 13017848.0, "step": 12325 }, { "entropy": 0.08796740350895561, "epoch": 1.089067219294958, "grad_norm": 2.125, "learning_rate": 0.00014726897573582725, "loss": 0.12587616920471192, "mean_token_accuracy": 0.9732900467514992, "num_tokens": 13044023.0, "step": 12350 }, { "entropy": 0.07419176746087032, "epoch": 1.0912718534359223, "grad_norm": 0.671875, "learning_rate": 0.00014705910370518637, "loss": 0.09034128189086914, "mean_token_accuracy": 0.9786489099264145, "num_tokens": 13071137.0, "step": 12375 }, { "entropy": 0.0814774570602458, "epoch": 1.0934764875768865, "grad_norm": 3.40625, "learning_rate": 0.00014684896509817503, "loss": 0.12874791145324707, "mean_token_accuracy": 0.9732303580641747, "num_tokens": 13098158.0, "step": 12400 }, { "entropy": 0.0893553533218801, "epoch": 1.0956811217178508, "grad_norm": 2.84375, "learning_rate": 0.0001466385611051684, "loss": 0.12181337356567383, "mean_token_accuracy": 0.9726255944371224, "num_tokens": 13124123.0, "step": 12425 }, { "entropy": 0.08096294098184444, "epoch": 1.0978857558588153, "grad_norm": 2.765625, "learning_rate": 0.00014642789291804495, "loss": 0.10353910446166992, "mean_token_accuracy": 0.9766229891777038, "num_tokens": 13151204.0, "step": 12450 }, { "entropy": 0.06782732794832555, "epoch": 1.1000903899997796, "grad_norm": 0.7890625, "learning_rate": 0.00014621696173017977, "loss": 0.09019325256347656, "mean_token_accuracy": 0.9796833488345146, "num_tokens": 13177694.0, "step": 12475 }, { "entropy": 0.07130046323902206, "epoch": 1.1022950241407439, "grad_norm": 1.796875, "learning_rate": 0.00014600576873643773, "loss": 0.09772644996643066, "mean_token_accuracy": 0.9763439601659775, "num_tokens": 13205202.0, "step": 12500 }, { "entropy": 0.06136355609982275, "epoch": 1.1044996582817082, "grad_norm": 2.6875, "learning_rate": 0.00014579431513316672, "loss": 0.07834446430206299, "mean_token_accuracy": 0.9810146534442902, "num_tokens": 13230360.0, "step": 12525 }, { "entropy": 0.06879065922315931, "epoch": 1.1067042924226724, "grad_norm": 0.9921875, "learning_rate": 0.00014558260211819106, "loss": 0.11468278884887695, "mean_token_accuracy": 0.9763934463262558, "num_tokens": 13256900.0, "step": 12550 }, { "entropy": 0.07944254042347892, "epoch": 1.1089089265636367, "grad_norm": 7.125, "learning_rate": 0.00014537063089080436, "loss": 0.10132018089294434, "mean_token_accuracy": 0.9765703016519547, "num_tokens": 13283731.0, "step": 12575 }, { "entropy": 0.07877481998410076, "epoch": 1.111113560704601, "grad_norm": 6.0625, "learning_rate": 0.00014515840265176308, "loss": 0.10516364097595216, "mean_token_accuracy": 0.9745470371842384, "num_tokens": 13309321.0, "step": 12600 }, { "entropy": 0.0723288451100234, "epoch": 1.1133181948455655, "grad_norm": 1.890625, "learning_rate": 0.00014494591860327952, "loss": 0.09705366134643555, "mean_token_accuracy": 0.978722071647644, "num_tokens": 13336571.0, "step": 12625 }, { "entropy": 0.07894025734014576, "epoch": 1.1155228289865298, "grad_norm": 2.46875, "learning_rate": 0.00014473317994901508, "loss": 0.1049672794342041, "mean_token_accuracy": 0.9736948177218437, "num_tokens": 13361742.0, "step": 12650 }, { "entropy": 0.06260268139514664, "epoch": 1.117727463127494, "grad_norm": 1.09375, "learning_rate": 0.0001445201878940734, "loss": 0.08412216186523437, "mean_token_accuracy": 0.9800903624296189, "num_tokens": 13387859.0, "step": 12675 }, { "entropy": 0.0810048793273745, "epoch": 1.1199320972684583, "grad_norm": 4.0625, "learning_rate": 0.00014430694364499363, "loss": 0.13930511474609375, "mean_token_accuracy": 0.9736354485154152, "num_tokens": 13414615.0, "step": 12700 }, { "entropy": 0.08986480454841513, "epoch": 1.1221367314094226, "grad_norm": 1.984375, "learning_rate": 0.00014409344840974345, "loss": 0.12213245391845703, "mean_token_accuracy": 0.9764112460613251, "num_tokens": 13441875.0, "step": 12725 }, { "entropy": 0.06098911547655007, "epoch": 1.1243413655503869, "grad_norm": 2.96875, "learning_rate": 0.00014387970339771236, "loss": 0.08347887992858886, "mean_token_accuracy": 0.9798440513014793, "num_tokens": 13468080.0, "step": 12750 }, { "entropy": 0.07254652585543227, "epoch": 1.1265459996913512, "grad_norm": 1.6171875, "learning_rate": 0.00014366570981970468, "loss": 0.1049235725402832, "mean_token_accuracy": 0.9728858286142349, "num_tokens": 13493997.0, "step": 12775 }, { "entropy": 0.08353760754165705, "epoch": 1.1287506338323154, "grad_norm": 4.0, "learning_rate": 0.000143451468887933, "loss": 0.1097674560546875, "mean_token_accuracy": 0.972501235306263, "num_tokens": 13519838.0, "step": 12800 }, { "entropy": 0.08435321261407808, "epoch": 1.1309552679732797, "grad_norm": 4.3125, "learning_rate": 0.00014323698181601085, "loss": 0.13715192794799805, "mean_token_accuracy": 0.9768632537126541, "num_tokens": 13546798.0, "step": 12825 }, { "entropy": 0.10204421273374464, "epoch": 1.1331599021142442, "grad_norm": 2.984375, "learning_rate": 0.00014302224981894616, "loss": 0.12858396530151367, "mean_token_accuracy": 0.9731094029545784, "num_tokens": 13573745.0, "step": 12850 }, { "entropy": 0.06739531061204616, "epoch": 1.1353645362552085, "grad_norm": 2.46875, "learning_rate": 0.0001428072741131344, "loss": 0.096489896774292, "mean_token_accuracy": 0.9783440440893173, "num_tokens": 13599946.0, "step": 12875 }, { "entropy": 0.09769865162845236, "epoch": 1.1375691703961728, "grad_norm": 2.109375, "learning_rate": 0.00014259205591635144, "loss": 0.1369704532623291, "mean_token_accuracy": 0.9713424414396286, "num_tokens": 13627369.0, "step": 12900 }, { "entropy": 0.07299916661781027, "epoch": 1.139773804537137, "grad_norm": 4.75, "learning_rate": 0.00014237659644774684, "loss": 0.08844392776489257, "mean_token_accuracy": 0.9792978599667549, "num_tokens": 13653006.0, "step": 12925 }, { "entropy": 0.068821275804803, "epoch": 1.1419784386781013, "grad_norm": 2.296875, "learning_rate": 0.00014216089692783694, "loss": 0.08574678421020508, "mean_token_accuracy": 0.9801573866605758, "num_tokens": 13679808.0, "step": 12950 }, { "entropy": 0.057801376127754336, "epoch": 1.1441830728190656, "grad_norm": 4.15625, "learning_rate": 0.00014194495857849782, "loss": 0.09204750061035157, "mean_token_accuracy": 0.9782607558369637, "num_tokens": 13705484.0, "step": 12975 }, { "entropy": 0.06144713588932063, "epoch": 1.14638770696003, "grad_norm": 2.5, "learning_rate": 0.00014172878262295853, "loss": 0.0980979061126709, "mean_token_accuracy": 0.978396021425724, "num_tokens": 13731153.0, "step": 13000 }, { "epoch": 1.14638770696003, "eval_entropy": 0.040426121556151766, "eval_loss": 0.04483678936958313, "eval_mean_token_accuracy": 0.987208288053045, "eval_num_tokens": 13731153.0, "eval_runtime": 248.9407, "eval_samples_per_second": 15.783, "eval_steps_per_second": 3.949, "step": 13000 }, { "entropy": 0.07772521212056745, "epoch": 1.1485923411009944, "grad_norm": 2.5, "learning_rate": 0.00014151237028579402, "loss": 0.09974138259887695, "mean_token_accuracy": 0.9792518520355225, "num_tokens": 13756932.0, "step": 13025 }, { "entropy": 0.08041290965047665, "epoch": 1.1507969752419587, "grad_norm": 2.796875, "learning_rate": 0.00014129572279291837, "loss": 0.12203351974487305, "mean_token_accuracy": 0.9737589022517205, "num_tokens": 13783460.0, "step": 13050 }, { "entropy": 0.08667991714755771, "epoch": 1.153001609382923, "grad_norm": 3.5625, "learning_rate": 0.00014107884137157768, "loss": 0.12619853019714355, "mean_token_accuracy": 0.969507671892643, "num_tokens": 13810053.0, "step": 13075 }, { "entropy": 0.07824131268818746, "epoch": 1.1552062435238872, "grad_norm": 1.28125, "learning_rate": 0.00014086172725034316, "loss": 0.08437572479248047, "mean_token_accuracy": 0.9793237087130546, "num_tokens": 13835469.0, "step": 13100 }, { "entropy": 0.07533937879052247, "epoch": 1.1574108776648515, "grad_norm": 4.6875, "learning_rate": 0.00014064438165910432, "loss": 0.1265271282196045, "mean_token_accuracy": 0.9731573021411896, "num_tokens": 13862336.0, "step": 13125 }, { "entropy": 0.07942030368372797, "epoch": 1.1596155118058158, "grad_norm": 3.1875, "learning_rate": 0.00014042680582906176, "loss": 0.1086429500579834, "mean_token_accuracy": 0.9767365399003028, "num_tokens": 13888059.0, "step": 13150 }, { "entropy": 0.064769710698165, "epoch": 1.16182014594678, "grad_norm": 0.62109375, "learning_rate": 0.00014020900099272036, "loss": 0.08750583648681641, "mean_token_accuracy": 0.9795074462890625, "num_tokens": 13912788.0, "step": 13175 }, { "entropy": 0.07411962449958082, "epoch": 1.1640247800877443, "grad_norm": 3.421875, "learning_rate": 0.00013999096838388227, "loss": 0.09150676727294922, "mean_token_accuracy": 0.9767512452602386, "num_tokens": 13939505.0, "step": 13200 }, { "entropy": 0.07226109138398897, "epoch": 1.1662294142287086, "grad_norm": 2.46875, "learning_rate": 0.0001397727092376399, "loss": 0.10258906364440917, "mean_token_accuracy": 0.9773488900065422, "num_tokens": 13967074.0, "step": 13225 }, { "entropy": 0.06648587315896293, "epoch": 1.1684340483696731, "grad_norm": 6.71875, "learning_rate": 0.0001395542247903689, "loss": 0.0932936954498291, "mean_token_accuracy": 0.980256330370903, "num_tokens": 13993453.0, "step": 13250 }, { "entropy": 0.07006794507964514, "epoch": 1.1706386825106374, "grad_norm": 2.375, "learning_rate": 0.00013933551627972124, "loss": 0.08853204727172852, "mean_token_accuracy": 0.9816073548793792, "num_tokens": 14019777.0, "step": 13275 }, { "entropy": 0.06473562117433175, "epoch": 1.1728433166516017, "grad_norm": 5.4375, "learning_rate": 0.00013911658494461808, "loss": 0.08555278778076172, "mean_token_accuracy": 0.9794245132803917, "num_tokens": 14046099.0, "step": 13300 }, { "entropy": 0.057029941927758045, "epoch": 1.175047950792566, "grad_norm": 2.15625, "learning_rate": 0.0001388974320252429, "loss": 0.08000053405761719, "mean_token_accuracy": 0.9806934934854508, "num_tokens": 14071432.0, "step": 13325 }, { "entropy": 0.059837884416338055, "epoch": 1.1772525849335302, "grad_norm": 2.5625, "learning_rate": 0.00013867805876303425, "loss": 0.09512213706970214, "mean_token_accuracy": 0.9803010419011116, "num_tokens": 14097921.0, "step": 13350 }, { "entropy": 0.08421937897976023, "epoch": 1.1794572190744945, "grad_norm": 2.015625, "learning_rate": 0.00013845846640067906, "loss": 0.13767002105712892, "mean_token_accuracy": 0.9754521402716637, "num_tokens": 14123607.0, "step": 13375 }, { "entropy": 0.07827224752778421, "epoch": 1.1816618532154588, "grad_norm": 2.453125, "learning_rate": 0.00013823865618210523, "loss": 0.09929595947265625, "mean_token_accuracy": 0.9765499302744866, "num_tokens": 14150885.0, "step": 13400 }, { "entropy": 0.07516255997674307, "epoch": 1.1838664873564233, "grad_norm": 4.8125, "learning_rate": 0.00013801862935247484, "loss": 0.09415432929992676, "mean_token_accuracy": 0.9775673219561577, "num_tokens": 14177368.0, "step": 13425 }, { "entropy": 0.08141786261985544, "epoch": 1.1860711214973876, "grad_norm": 2.296875, "learning_rate": 0.00013779838715817695, "loss": 0.11828108787536622, "mean_token_accuracy": 0.9732054328918457, "num_tokens": 14203895.0, "step": 13450 }, { "entropy": 0.06786765140190255, "epoch": 1.1882757556383519, "grad_norm": 2.0, "learning_rate": 0.00013757793084682066, "loss": 0.0916921329498291, "mean_token_accuracy": 0.979742522239685, "num_tokens": 14229737.0, "step": 13475 }, { "entropy": 0.06600234019482741, "epoch": 1.1904803897793161, "grad_norm": 2.5625, "learning_rate": 0.00013735726166722799, "loss": 0.08265236854553222, "mean_token_accuracy": 0.9800075829029083, "num_tokens": 14255643.0, "step": 13500 }, { "entropy": 0.06735640317740035, "epoch": 1.1926850239202804, "grad_norm": 4.75, "learning_rate": 0.00013713638086942668, "loss": 0.08291936874389648, "mean_token_accuracy": 0.9814954048395157, "num_tokens": 14279948.0, "step": 13525 }, { "entropy": 0.06462765499542002, "epoch": 1.1948896580612447, "grad_norm": 5.625, "learning_rate": 0.00013691528970464334, "loss": 0.09613308906555176, "mean_token_accuracy": 0.9818469232320786, "num_tokens": 14306310.0, "step": 13550 }, { "entropy": 0.06327619094226974, "epoch": 1.197094292202209, "grad_norm": 5.40625, "learning_rate": 0.00013669398942529627, "loss": 0.09374700546264648, "mean_token_accuracy": 0.9792777088284492, "num_tokens": 14331413.0, "step": 13575 }, { "entropy": 0.07611146297247615, "epoch": 1.1992989263431733, "grad_norm": 1.4375, "learning_rate": 0.0001364724812849882, "loss": 0.10114512443542481, "mean_token_accuracy": 0.97606270134449, "num_tokens": 14358067.0, "step": 13600 }, { "entropy": 0.06686223473065184, "epoch": 1.2015035604841375, "grad_norm": 3.3125, "learning_rate": 0.00013625076653849956, "loss": 0.10389527320861816, "mean_token_accuracy": 0.9748416700959206, "num_tokens": 14383570.0, "step": 13625 }, { "entropy": 0.09236453044810332, "epoch": 1.203708194625102, "grad_norm": 2.375, "learning_rate": 0.00013602884644178088, "loss": 0.11577310562133789, "mean_token_accuracy": 0.9753095716238022, "num_tokens": 14410596.0, "step": 13650 }, { "entropy": 0.06537643666742951, "epoch": 1.2059128287660663, "grad_norm": 2.265625, "learning_rate": 0.00013580672225194614, "loss": 0.07989202499389648, "mean_token_accuracy": 0.9799061450362205, "num_tokens": 14435854.0, "step": 13675 }, { "entropy": 0.055518159719649705, "epoch": 1.2081174629070306, "grad_norm": 0.765625, "learning_rate": 0.00013558439522726534, "loss": 0.07418478488922119, "mean_token_accuracy": 0.9806128066778182, "num_tokens": 14460482.0, "step": 13700 }, { "entropy": 0.08347531014704146, "epoch": 1.2103220970479949, "grad_norm": 1.140625, "learning_rate": 0.00013536186662715756, "loss": 0.10274332046508788, "mean_token_accuracy": 0.9763949981331825, "num_tokens": 14486796.0, "step": 13725 }, { "entropy": 0.07670228075119667, "epoch": 1.2125267311889592, "grad_norm": 1.828125, "learning_rate": 0.0001351391377121837, "loss": 0.1251106834411621, "mean_token_accuracy": 0.9748952552676201, "num_tokens": 14515063.0, "step": 13750 }, { "entropy": 0.07520388591961819, "epoch": 1.2147313653299234, "grad_norm": 3.296875, "learning_rate": 0.0001349162097440394, "loss": 0.13189183235168456, "mean_token_accuracy": 0.9732701346278191, "num_tokens": 14540360.0, "step": 13775 }, { "entropy": 0.06956776957900729, "epoch": 1.2169359994708877, "grad_norm": 7.84375, "learning_rate": 0.00013469308398554778, "loss": 0.09446800231933594, "mean_token_accuracy": 0.9780878749489784, "num_tokens": 14566124.0, "step": 13800 }, { "entropy": 0.07163769841310569, "epoch": 1.2191406336118522, "grad_norm": 3.53125, "learning_rate": 0.00013446976170065263, "loss": 0.09359949111938476, "mean_token_accuracy": 0.9788430881500244, "num_tokens": 14591871.0, "step": 13825 }, { "entropy": 0.07607505476626102, "epoch": 1.2213452677528165, "grad_norm": 1.328125, "learning_rate": 0.00013424624415441077, "loss": 0.10732210159301758, "mean_token_accuracy": 0.9768683615326882, "num_tokens": 14618746.0, "step": 13850 }, { "entropy": 0.07799635212053545, "epoch": 1.2235499018937808, "grad_norm": 2.296875, "learning_rate": 0.00013402253261298524, "loss": 0.10910382270812988, "mean_token_accuracy": 0.9741278582811356, "num_tokens": 14645850.0, "step": 13875 }, { "entropy": 0.0740620466673863, "epoch": 1.225754536034745, "grad_norm": 1.71875, "learning_rate": 0.00013379862834363797, "loss": 0.10109277725219727, "mean_token_accuracy": 0.9789579641819001, "num_tokens": 14672531.0, "step": 13900 }, { "entropy": 0.07713249627413461, "epoch": 1.2279591701757093, "grad_norm": 2.75, "learning_rate": 0.00013357453261472258, "loss": 0.10372325897216797, "mean_token_accuracy": 0.9723598077893257, "num_tokens": 14698635.0, "step": 13925 }, { "entropy": 0.07735523965937319, "epoch": 1.2301638043166736, "grad_norm": 2.09375, "learning_rate": 0.0001333502466956774, "loss": 0.10036856651306153, "mean_token_accuracy": 0.9775028115510941, "num_tokens": 14723411.0, "step": 13950 }, { "entropy": 0.07590767514484469, "epoch": 1.2323684384576379, "grad_norm": 2.046875, "learning_rate": 0.00013312577185701802, "loss": 0.09764260292053223, "mean_token_accuracy": 0.9767592492699623, "num_tokens": 14750384.0, "step": 13975 }, { "entropy": 0.07375224770570639, "epoch": 1.2345730725986024, "grad_norm": 1.3515625, "learning_rate": 0.00013290110937033022, "loss": 0.10770444869995117, "mean_token_accuracy": 0.9764727628231049, "num_tokens": 14778089.0, "step": 14000 }, { "epoch": 1.2345730725986024, "eval_entropy": 0.03317551793803443, "eval_loss": 0.042815063148736954, "eval_mean_token_accuracy": 0.9874204469964075, "eval_num_tokens": 14778089.0, "eval_runtime": 249.1251, "eval_samples_per_second": 15.771, "eval_steps_per_second": 3.946, "step": 14000 }, { "entropy": 0.08082759477692888, "epoch": 1.2367777067395667, "grad_norm": 3.21875, "learning_rate": 0.00013267626050826275, "loss": 0.1340487289428711, "mean_token_accuracy": 0.9732246950268746, "num_tokens": 14806606.0, "step": 14025 }, { "entropy": 0.06270176315927528, "epoch": 1.238982340880531, "grad_norm": 1.8515625, "learning_rate": 0.0001324512265445202, "loss": 0.0792490816116333, "mean_token_accuracy": 0.9802202478051185, "num_tokens": 14832296.0, "step": 14050 }, { "entropy": 0.06344090539118043, "epoch": 1.2411869750214952, "grad_norm": 1.6484375, "learning_rate": 0.0001322260087538556, "loss": 0.07540414333343506, "mean_token_accuracy": 0.9804856219887733, "num_tokens": 14857948.0, "step": 14075 }, { "entropy": 0.07318885205080733, "epoch": 1.2433916091624595, "grad_norm": 3.0625, "learning_rate": 0.00013200060841206336, "loss": 0.10172588348388673, "mean_token_accuracy": 0.9763992458581925, "num_tokens": 14885010.0, "step": 14100 }, { "entropy": 0.09100308601686265, "epoch": 1.2455962433034238, "grad_norm": 0.64453125, "learning_rate": 0.00013177502679597198, "loss": 0.11631698608398437, "mean_token_accuracy": 0.9720056369900704, "num_tokens": 14911651.0, "step": 14125 }, { "entropy": 0.06297349792752356, "epoch": 1.247800877444388, "grad_norm": 4.34375, "learning_rate": 0.00013154926518343685, "loss": 0.08014249801635742, "mean_token_accuracy": 0.9779363590478897, "num_tokens": 14937307.0, "step": 14150 }, { "entropy": 0.06706018248645705, "epoch": 1.2500055115853523, "grad_norm": 3.5, "learning_rate": 0.00013132332485333293, "loss": 0.09858132362365722, "mean_token_accuracy": 0.9780547374486923, "num_tokens": 14963286.0, "step": 14175 }, { "entropy": 0.06707234902569326, "epoch": 1.2522101457263166, "grad_norm": 3.8125, "learning_rate": 0.0001310972070855477, "loss": 0.09439393043518067, "mean_token_accuracy": 0.9792554202675819, "num_tokens": 14988574.0, "step": 14200 }, { "entropy": 0.055138559531333155, "epoch": 1.2544147798672811, "grad_norm": 2.0625, "learning_rate": 0.00013087091316097357, "loss": 0.07935715675354003, "mean_token_accuracy": 0.9827534911036492, "num_tokens": 15014049.0, "step": 14225 }, { "entropy": 0.05755839720135555, "epoch": 1.2566194140082454, "grad_norm": 0.70703125, "learning_rate": 0.000130644444361501, "loss": 0.08224278450012207, "mean_token_accuracy": 0.9831167414784432, "num_tokens": 15040415.0, "step": 14250 }, { "entropy": 0.07995560545474291, "epoch": 1.2588240481492097, "grad_norm": 1.3671875, "learning_rate": 0.00013041780197001096, "loss": 0.10370305061340332, "mean_token_accuracy": 0.9753643274307251, "num_tokens": 15068204.0, "step": 14275 }, { "entropy": 0.05787755233614007, "epoch": 1.261028682290174, "grad_norm": 1.328125, "learning_rate": 0.00013019098727036783, "loss": 0.07909601211547851, "mean_token_accuracy": 0.9826915863156319, "num_tokens": 15094396.0, "step": 14300 }, { "entropy": 0.07108791207894682, "epoch": 1.2632333164311382, "grad_norm": 3.328125, "learning_rate": 0.00012996400154741206, "loss": 0.11207365036010743, "mean_token_accuracy": 0.9777934941649437, "num_tokens": 15120265.0, "step": 14325 }, { "entropy": 0.07744198189320742, "epoch": 1.2654379505721025, "grad_norm": 3.78125, "learning_rate": 0.0001297368460869529, "loss": 0.0994594669342041, "mean_token_accuracy": 0.9777314651012421, "num_tokens": 15147541.0, "step": 14350 }, { "entropy": 0.0713876857640571, "epoch": 1.2676425847130668, "grad_norm": 1.796875, "learning_rate": 0.000129509522175761, "loss": 0.1327120590209961, "mean_token_accuracy": 0.9768693792819977, "num_tokens": 15174152.0, "step": 14375 }, { "entropy": 0.07512093096884201, "epoch": 1.2698472188540313, "grad_norm": 3.40625, "learning_rate": 0.00012928203110156147, "loss": 0.0983432388305664, "mean_token_accuracy": 0.9782489436864853, "num_tokens": 15199844.0, "step": 14400 }, { "entropy": 0.07445235479419353, "epoch": 1.2720518529949953, "grad_norm": 0.53515625, "learning_rate": 0.00012905437415302614, "loss": 0.10376456260681152, "mean_token_accuracy": 0.9784587541222572, "num_tokens": 15226788.0, "step": 14425 }, { "entropy": 0.07068355955168953, "epoch": 1.2742564871359598, "grad_norm": 3.203125, "learning_rate": 0.00012882655261976656, "loss": 0.09820618629455566, "mean_token_accuracy": 0.9778537949919701, "num_tokens": 15252536.0, "step": 14450 }, { "entropy": 0.06282317588367732, "epoch": 1.2764611212769241, "grad_norm": 3.203125, "learning_rate": 0.00012859856779232656, "loss": 0.09604778289794921, "mean_token_accuracy": 0.9794092005491257, "num_tokens": 15278867.0, "step": 14475 }, { "entropy": 0.06273727558698738, "epoch": 1.2786657554178884, "grad_norm": 0.7421875, "learning_rate": 0.00012837042096217513, "loss": 0.08490729331970215, "mean_token_accuracy": 0.9807071474194526, "num_tokens": 15305157.0, "step": 14500 }, { "entropy": 0.07701689342531609, "epoch": 1.2808703895588527, "grad_norm": 1.6015625, "learning_rate": 0.00012814211342169872, "loss": 0.10085676193237304, "mean_token_accuracy": 0.9763132336735726, "num_tokens": 15332404.0, "step": 14525 }, { "entropy": 0.08465464082692051, "epoch": 1.283075023699817, "grad_norm": 2.328125, "learning_rate": 0.00012791364646419436, "loss": 0.116805419921875, "mean_token_accuracy": 0.974503707587719, "num_tokens": 15361644.0, "step": 14550 }, { "entropy": 0.08087968374718912, "epoch": 1.2852796578407812, "grad_norm": 1.4140625, "learning_rate": 0.00012768502138386196, "loss": 0.11668486595153808, "mean_token_accuracy": 0.9748384010791779, "num_tokens": 15388581.0, "step": 14575 }, { "entropy": 0.06501860655174824, "epoch": 1.2874842919817455, "grad_norm": 3.765625, "learning_rate": 0.00012745623947579737, "loss": 0.08816573143005371, "mean_token_accuracy": 0.9800462105870247, "num_tokens": 15414389.0, "step": 14600 }, { "entropy": 0.05114416478230851, "epoch": 1.28968892612271, "grad_norm": 3.09375, "learning_rate": 0.0001272273020359846, "loss": 0.07702481269836425, "mean_token_accuracy": 0.9823195374011994, "num_tokens": 15439393.0, "step": 14625 }, { "entropy": 0.06628791322320467, "epoch": 1.2918935602636743, "grad_norm": 1.625, "learning_rate": 0.0001269982103612889, "loss": 0.08344200134277344, "mean_token_accuracy": 0.9795394757390022, "num_tokens": 15464910.0, "step": 14650 }, { "entropy": 0.06431539795725257, "epoch": 1.2940981944046386, "grad_norm": 2.421875, "learning_rate": 0.00012676896574944903, "loss": 0.07873389720916749, "mean_token_accuracy": 0.9806241154670715, "num_tokens": 15490634.0, "step": 14675 }, { "entropy": 0.07032535757040023, "epoch": 1.2963028285456029, "grad_norm": 1.734375, "learning_rate": 0.00012653956949907027, "loss": 0.1325330352783203, "mean_token_accuracy": 0.9765818390250206, "num_tokens": 15517211.0, "step": 14700 }, { "entropy": 0.07646813986793859, "epoch": 1.2985074626865671, "grad_norm": 0.421875, "learning_rate": 0.0001263100229096167, "loss": 0.10172966003417969, "mean_token_accuracy": 0.9765880072116851, "num_tokens": 15543427.0, "step": 14725 }, { "entropy": 0.07018844754958992, "epoch": 1.3007120968275314, "grad_norm": 6.0625, "learning_rate": 0.00012608032728140422, "loss": 0.09067879676818848, "mean_token_accuracy": 0.9805409485101699, "num_tokens": 15569803.0, "step": 14750 }, { "entropy": 0.07123511103447527, "epoch": 1.3029167309684957, "grad_norm": 3.59375, "learning_rate": 0.0001258504839155929, "loss": 0.09630011558532715, "mean_token_accuracy": 0.9766310065984726, "num_tokens": 15597403.0, "step": 14775 }, { "entropy": 0.07258567026554374, "epoch": 1.3051213651094602, "grad_norm": 2.625, "learning_rate": 0.00012562049411417965, "loss": 0.10658187866210937, "mean_token_accuracy": 0.9771447145938873, "num_tokens": 15624262.0, "step": 14800 }, { "entropy": 0.07019207466975785, "epoch": 1.3073259992504245, "grad_norm": 3.671875, "learning_rate": 0.00012539035917999097, "loss": 0.09414088249206543, "mean_token_accuracy": 0.976831995844841, "num_tokens": 15651296.0, "step": 14825 }, { "entropy": 0.057612950848124456, "epoch": 1.3095306333913888, "grad_norm": 2.421875, "learning_rate": 0.0001251600804166755, "loss": 0.08104521751403809, "mean_token_accuracy": 0.9797638493776322, "num_tokens": 15676634.0, "step": 14850 }, { "entropy": 0.08526865348889259, "epoch": 1.311735267532353, "grad_norm": 2.546875, "learning_rate": 0.00012492965912869658, "loss": 0.10663084983825684, "mean_token_accuracy": 0.9760200345516205, "num_tokens": 15704426.0, "step": 14875 }, { "entropy": 0.07147054383560317, "epoch": 1.3139399016733173, "grad_norm": 2.109375, "learning_rate": 0.00012469909662132496, "loss": 0.09058443069458008, "mean_token_accuracy": 0.9777011832594872, "num_tokens": 15731370.0, "step": 14900 }, { "entropy": 0.05473189148120582, "epoch": 1.3161445358142816, "grad_norm": 0.6953125, "learning_rate": 0.00012446839420063137, "loss": 0.07786131858825683, "mean_token_accuracy": 0.9817092132568359, "num_tokens": 15757600.0, "step": 14925 }, { "entropy": 0.07791225100867451, "epoch": 1.3183491699552459, "grad_norm": 2.421875, "learning_rate": 0.000124237553173479, "loss": 0.14551064491271973, "mean_token_accuracy": 0.9755642533302307, "num_tokens": 15785308.0, "step": 14950 }, { "entropy": 0.07555221020767931, "epoch": 1.3205538040962104, "grad_norm": 0.84375, "learning_rate": 0.00012400657484751634, "loss": 0.08915863037109376, "mean_token_accuracy": 0.9793051525950431, "num_tokens": 15811726.0, "step": 14975 }, { "entropy": 0.06056748362811049, "epoch": 1.3227584382371744, "grad_norm": 1.5859375, "learning_rate": 0.00012377546053116958, "loss": 0.09584538459777832, "mean_token_accuracy": 0.9806088766455651, "num_tokens": 15837688.0, "step": 15000 }, { "epoch": 1.3227584382371744, "eval_entropy": 0.03326694843892694, "eval_loss": 0.04031017795205116, "eval_mean_token_accuracy": 0.9879487514010766, "eval_num_tokens": 15837688.0, "eval_runtime": 248.7592, "eval_samples_per_second": 15.794, "eval_steps_per_second": 3.952, "step": 15000 }, { "entropy": 0.06415669579728274, "epoch": 1.324963072378139, "grad_norm": 2.859375, "learning_rate": 0.0001235442115336352, "loss": 0.08943920135498047, "mean_token_accuracy": 0.9794209009408951, "num_tokens": 15863738.0, "step": 15025 }, { "entropy": 0.08142153425927973, "epoch": 1.3271677065191032, "grad_norm": 2.140625, "learning_rate": 0.0001233128291648727, "loss": 0.11874267578125, "mean_token_accuracy": 0.9757076546549797, "num_tokens": 15890894.0, "step": 15050 }, { "entropy": 0.0783909736975329, "epoch": 1.3293723406600675, "grad_norm": 2.0625, "learning_rate": 0.0001230813147355971, "loss": 0.11000186920166016, "mean_token_accuracy": 0.9764264470338821, "num_tokens": 15918938.0, "step": 15075 }, { "entropy": 0.06577202208303788, "epoch": 1.3315769748010318, "grad_norm": 0.9453125, "learning_rate": 0.0001228496695572714, "loss": 0.10996245384216309, "mean_token_accuracy": 0.9817772251367569, "num_tokens": 15946088.0, "step": 15100 }, { "entropy": 0.06237481433723588, "epoch": 1.333781608941996, "grad_norm": 0.8125, "learning_rate": 0.00012261789494209937, "loss": 0.08556965827941894, "mean_token_accuracy": 0.977502366900444, "num_tokens": 15971944.0, "step": 15125 }, { "entropy": 0.06582288480800344, "epoch": 1.3359862430829603, "grad_norm": 2.5625, "learning_rate": 0.00012238599220301788, "loss": 0.09093325614929199, "mean_token_accuracy": 0.9768818697333336, "num_tokens": 15997233.0, "step": 15150 }, { "entropy": 0.065389038942958, "epoch": 1.3381908772239246, "grad_norm": 0.80078125, "learning_rate": 0.00012215396265368973, "loss": 0.09562211036682129, "mean_token_accuracy": 0.9802220293879509, "num_tokens": 16024740.0, "step": 15175 }, { "entropy": 0.08525684173408081, "epoch": 1.340395511364889, "grad_norm": 2.921875, "learning_rate": 0.00012192180760849595, "loss": 0.1441513729095459, "mean_token_accuracy": 0.970992026925087, "num_tokens": 16051698.0, "step": 15200 }, { "entropy": 0.06295717555127339, "epoch": 1.3426001455058534, "grad_norm": 4.5625, "learning_rate": 0.00012168952838252853, "loss": 0.0884068489074707, "mean_token_accuracy": 0.9819012552499771, "num_tokens": 16077703.0, "step": 15225 }, { "entropy": 0.060524827540357366, "epoch": 1.3448047796468177, "grad_norm": 2.5, "learning_rate": 0.00012145712629158286, "loss": 0.08740668296813965, "mean_token_accuracy": 0.9805832356214523, "num_tokens": 16104630.0, "step": 15250 }, { "entropy": 0.07011599280362134, "epoch": 1.347009413787782, "grad_norm": 2.203125, "learning_rate": 0.00012122460265215038, "loss": 0.10101802825927735, "mean_token_accuracy": 0.9771647998690605, "num_tokens": 16130521.0, "step": 15275 }, { "entropy": 0.05694206524029141, "epoch": 1.3492140479287462, "grad_norm": 2.640625, "learning_rate": 0.000120991958781411, "loss": 0.07328418731689453, "mean_token_accuracy": 0.9819918230175972, "num_tokens": 16155447.0, "step": 15300 }, { "entropy": 0.05970909093797672, "epoch": 1.3514186820697105, "grad_norm": 2.96875, "learning_rate": 0.00012075919599722583, "loss": 0.08711193084716796, "mean_token_accuracy": 0.9814882269501686, "num_tokens": 16181215.0, "step": 15325 }, { "entropy": 0.08422257184254704, "epoch": 1.3536233162106748, "grad_norm": 0.48828125, "learning_rate": 0.00012052631561812941, "loss": 0.1124593448638916, "mean_token_accuracy": 0.9755303618311882, "num_tokens": 16208701.0, "step": 15350 }, { "entropy": 0.06997806068888167, "epoch": 1.3558279503516393, "grad_norm": 0.7578125, "learning_rate": 0.00012029331896332259, "loss": 0.08874547958374024, "mean_token_accuracy": 0.9804951578378678, "num_tokens": 16235817.0, "step": 15375 }, { "entropy": 0.06830648218063289, "epoch": 1.3580325844926033, "grad_norm": 0.84765625, "learning_rate": 0.00012006020735266474, "loss": 0.1033985424041748, "mean_token_accuracy": 0.9769791024923324, "num_tokens": 16262506.0, "step": 15400 }, { "entropy": 0.05717379552341299, "epoch": 1.3602372186335678, "grad_norm": 3.53125, "learning_rate": 0.00011982698210666657, "loss": 0.08480052947998047, "mean_token_accuracy": 0.9808929657936096, "num_tokens": 16288669.0, "step": 15425 }, { "entropy": 0.07715147120325128, "epoch": 1.3624418527745321, "grad_norm": 2.171875, "learning_rate": 0.00011959364454648238, "loss": 0.11012930870056152, "mean_token_accuracy": 0.9763377743959427, "num_tokens": 16314669.0, "step": 15450 }, { "entropy": 0.06519403403050092, "epoch": 1.3646464869154964, "grad_norm": 2.21875, "learning_rate": 0.0001193601959939028, "loss": 0.07466179847717286, "mean_token_accuracy": 0.9815621060132981, "num_tokens": 16340364.0, "step": 15475 }, { "entropy": 0.06409725191537291, "epoch": 1.3668511210564607, "grad_norm": 2.46875, "learning_rate": 0.00011912663777134707, "loss": 0.10391844749450684, "mean_token_accuracy": 0.9776857647299767, "num_tokens": 16367170.0, "step": 15500 }, { "entropy": 0.06850099113042234, "epoch": 1.369055755197425, "grad_norm": 1.578125, "learning_rate": 0.00011889297120185585, "loss": 0.10006083488464355, "mean_token_accuracy": 0.9765678381919861, "num_tokens": 16393766.0, "step": 15525 }, { "entropy": 0.06729476221749792, "epoch": 1.3712603893383892, "grad_norm": 0.75390625, "learning_rate": 0.0001186591976090834, "loss": 0.0871645450592041, "mean_token_accuracy": 0.9782984137535096, "num_tokens": 16420859.0, "step": 15550 }, { "entropy": 0.06440466578496853, "epoch": 1.3734650234793535, "grad_norm": 2.734375, "learning_rate": 0.00011842531831729031, "loss": 0.07289909362792969, "mean_token_accuracy": 0.9761216223239899, "num_tokens": 16446598.0, "step": 15575 }, { "entropy": 0.052396752069835205, "epoch": 1.375669657620318, "grad_norm": 1.578125, "learning_rate": 0.00011819133465133592, "loss": 0.07493824005126953, "mean_token_accuracy": 0.9828401359915734, "num_tokens": 16473781.0, "step": 15600 }, { "entropy": 0.05889642575762991, "epoch": 1.3778742917612823, "grad_norm": 2.796875, "learning_rate": 0.0001179572479366708, "loss": 0.08633296966552734, "mean_token_accuracy": 0.9812161940336227, "num_tokens": 16500150.0, "step": 15625 }, { "entropy": 0.07454636727910838, "epoch": 1.3800789259022466, "grad_norm": 1.8359375, "learning_rate": 0.00011772305949932928, "loss": 0.1060667610168457, "mean_token_accuracy": 0.9713787686824799, "num_tokens": 16527605.0, "step": 15650 }, { "entropy": 0.05934964165790006, "epoch": 1.3822835600432108, "grad_norm": 1.71875, "learning_rate": 0.00011748877066592192, "loss": 0.08717299461364746, "mean_token_accuracy": 0.9823898765444755, "num_tokens": 16554331.0, "step": 15675 }, { "entropy": 0.06696738389495295, "epoch": 1.3844881941841751, "grad_norm": 0.6796875, "learning_rate": 0.00011725438276362799, "loss": 0.09112286567687988, "mean_token_accuracy": 0.9790424865484237, "num_tokens": 16580564.0, "step": 15700 }, { "entropy": 0.06791757563012651, "epoch": 1.3866928283251394, "grad_norm": 1.4140625, "learning_rate": 0.00011701989712018798, "loss": 0.09548748970031738, "mean_token_accuracy": 0.9753042080998421, "num_tokens": 16606721.0, "step": 15725 }, { "entropy": 0.060772258413926465, "epoch": 1.3888974624661037, "grad_norm": 2.140625, "learning_rate": 0.00011678531506389594, "loss": 0.08734206199645995, "mean_token_accuracy": 0.9813879826664924, "num_tokens": 16632865.0, "step": 15750 }, { "entropy": 0.08063680976076285, "epoch": 1.3911020966070682, "grad_norm": 1.359375, "learning_rate": 0.00011655063792359226, "loss": 0.11765849113464355, "mean_token_accuracy": 0.9731235000491142, "num_tokens": 16660928.0, "step": 15775 }, { "entropy": 0.06548881910362979, "epoch": 1.3933067307480322, "grad_norm": 3.515625, "learning_rate": 0.00011631586702865582, "loss": 0.08853742599487305, "mean_token_accuracy": 0.982549340724945, "num_tokens": 16686061.0, "step": 15800 }, { "entropy": 0.05701154603782925, "epoch": 1.3955113648889967, "grad_norm": 3.015625, "learning_rate": 0.00011608100370899664, "loss": 0.08760712623596191, "mean_token_accuracy": 0.9825446775555611, "num_tokens": 16712645.0, "step": 15825 }, { "entropy": 0.07111379386769841, "epoch": 1.397715999029961, "grad_norm": 3.171875, "learning_rate": 0.00011584604929504823, "loss": 0.10779889106750488, "mean_token_accuracy": 0.9773144674301147, "num_tokens": 16739270.0, "step": 15850 }, { "entropy": 0.06202032042929204, "epoch": 1.3999206331709253, "grad_norm": 1.1796875, "learning_rate": 0.00011561100511776026, "loss": 0.06711362838745118, "mean_token_accuracy": 0.9842124137282372, "num_tokens": 16766150.0, "step": 15875 }, { "entropy": 0.05521934061674983, "epoch": 1.4021252673118896, "grad_norm": 1.65625, "learning_rate": 0.00011537587250859081, "loss": 0.07264583587646484, "mean_token_accuracy": 0.9817525637149811, "num_tokens": 16791967.0, "step": 15900 }, { "entropy": 0.06004962370701833, "epoch": 1.4043299014528539, "grad_norm": 2.40625, "learning_rate": 0.00011514065279949882, "loss": 0.09220392227172852, "mean_token_accuracy": 0.979226841032505, "num_tokens": 16818341.0, "step": 15925 }, { "entropy": 0.06723880005811225, "epoch": 1.4065345355938181, "grad_norm": 2.03125, "learning_rate": 0.00011490534732293677, "loss": 0.0945945930480957, "mean_token_accuracy": 0.9777571973204613, "num_tokens": 16844714.0, "step": 15950 }, { "entropy": 0.06054009918632801, "epoch": 1.4087391697347824, "grad_norm": 1.859375, "learning_rate": 0.00011466995741184288, "loss": 0.08592336654663085, "mean_token_accuracy": 0.9799673467874527, "num_tokens": 16870803.0, "step": 15975 }, { "entropy": 0.061580221586773405, "epoch": 1.410943803875747, "grad_norm": 0.83984375, "learning_rate": 0.00011443448439963374, "loss": 0.07537917137145995, "mean_token_accuracy": 0.9815564286708832, "num_tokens": 16896056.0, "step": 16000 }, { "epoch": 1.410943803875747, "eval_entropy": 0.03029697120738228, "eval_loss": 0.035930126905441284, "eval_mean_token_accuracy": 0.9891901263627977, "eval_num_tokens": 16896056.0, "eval_runtime": 248.7891, "eval_samples_per_second": 15.792, "eval_steps_per_second": 3.951, "step": 16000 }, { "entropy": 0.07580967456306098, "epoch": 1.4131484380167112, "grad_norm": 3.484375, "learning_rate": 0.00011419892962019665, "loss": 0.11364545822143554, "mean_token_accuracy": 0.9733854481577873, "num_tokens": 16924947.0, "step": 16025 }, { "entropy": 0.05546699926489964, "epoch": 1.4153530721576755, "grad_norm": 1.3125, "learning_rate": 0.00011396329440788207, "loss": 0.08760024070739746, "mean_token_accuracy": 0.981015156507492, "num_tokens": 16951077.0, "step": 16050 }, { "entropy": 0.08654316918458789, "epoch": 1.4175577062986398, "grad_norm": 2.296875, "learning_rate": 0.00011372758009749615, "loss": 0.1186264705657959, "mean_token_accuracy": 0.974712208211422, "num_tokens": 16979577.0, "step": 16075 }, { "entropy": 0.053080160124518445, "epoch": 1.419762340439604, "grad_norm": 0.96875, "learning_rate": 0.00011349178802429308, "loss": 0.07206877708435058, "mean_token_accuracy": 0.9819573852419853, "num_tokens": 17005306.0, "step": 16100 }, { "entropy": 0.06058032729837578, "epoch": 1.4219669745805683, "grad_norm": 0.93359375, "learning_rate": 0.00011325591952396755, "loss": 0.08690043449401856, "mean_token_accuracy": 0.980279144346714, "num_tokens": 17030374.0, "step": 16125 }, { "entropy": 0.06873179055764922, "epoch": 1.4241716087215326, "grad_norm": 2.4375, "learning_rate": 0.00011301997593264717, "loss": 0.1052779483795166, "mean_token_accuracy": 0.9760980147123337, "num_tokens": 17058470.0, "step": 16150 }, { "entropy": 0.06835755564185092, "epoch": 1.426376242862497, "grad_norm": 3.25, "learning_rate": 0.00011278395858688493, "loss": 0.09940330505371094, "mean_token_accuracy": 0.9808136349916459, "num_tokens": 17084609.0, "step": 16175 }, { "entropy": 0.05775143167622446, "epoch": 1.4285808770034611, "grad_norm": 1.21875, "learning_rate": 0.00011254786882365169, "loss": 0.07310149669647217, "mean_token_accuracy": 0.9825145149230957, "num_tokens": 17110272.0, "step": 16200 }, { "entropy": 0.05565455894873594, "epoch": 1.4307855111444256, "grad_norm": 1.4609375, "learning_rate": 0.00011231170798032839, "loss": 0.07367197513580322, "mean_token_accuracy": 0.9814733082056045, "num_tokens": 17137287.0, "step": 16225 }, { "entropy": 0.06046327952935826, "epoch": 1.43299014528539, "grad_norm": 1.1015625, "learning_rate": 0.00011207547739469882, "loss": 0.08034516334533691, "mean_token_accuracy": 0.980512827038765, "num_tokens": 17164148.0, "step": 16250 }, { "entropy": 0.07160910110978876, "epoch": 1.4351947794263542, "grad_norm": 2.578125, "learning_rate": 0.00011183917840494156, "loss": 0.09737605094909668, "mean_token_accuracy": 0.9788093277812004, "num_tokens": 17190982.0, "step": 16275 }, { "entropy": 0.06516157711521373, "epoch": 1.4373994135673185, "grad_norm": 2.40625, "learning_rate": 0.00011160281234962296, "loss": 0.0906210994720459, "mean_token_accuracy": 0.979938026368618, "num_tokens": 17217765.0, "step": 16300 }, { "entropy": 0.06518849867788959, "epoch": 1.4396040477082828, "grad_norm": 3.15625, "learning_rate": 0.00011136638056768909, "loss": 0.09936015129089355, "mean_token_accuracy": 0.981353671848774, "num_tokens": 17244375.0, "step": 16325 }, { "entropy": 0.06567524242636864, "epoch": 1.441808681849247, "grad_norm": 2.796875, "learning_rate": 0.00011112988439845847, "loss": 0.08795083999633789, "mean_token_accuracy": 0.9803341096639633, "num_tokens": 17271304.0, "step": 16350 }, { "entropy": 0.06841813209190149, "epoch": 1.4440133159902113, "grad_norm": 2.328125, "learning_rate": 0.00011089332518161424, "loss": 0.09773797035217285, "mean_token_accuracy": 0.9775998306274414, "num_tokens": 17298464.0, "step": 16375 }, { "entropy": 0.049831424697695186, "epoch": 1.4462179501311758, "grad_norm": 2.109375, "learning_rate": 0.00011065670425719677, "loss": 0.06948981761932373, "mean_token_accuracy": 0.9838525611162185, "num_tokens": 17324374.0, "step": 16400 }, { "entropy": 0.046832082935143265, "epoch": 1.44842258427214, "grad_norm": 0.94140625, "learning_rate": 0.00011042002296559593, "loss": 0.07201289176940918, "mean_token_accuracy": 0.9847058826684951, "num_tokens": 17350259.0, "step": 16425 }, { "entropy": 0.07186193967063445, "epoch": 1.4506272184131044, "grad_norm": 3.09375, "learning_rate": 0.00011018328264754363, "loss": 0.09589914321899413, "mean_token_accuracy": 0.9776010760664939, "num_tokens": 17377041.0, "step": 16450 }, { "entropy": 0.05535520975492545, "epoch": 1.4528318525540687, "grad_norm": 2.6875, "learning_rate": 0.00010994648464410606, "loss": 0.06740634441375733, "mean_token_accuracy": 0.9842914417386055, "num_tokens": 17402825.0, "step": 16475 }, { "entropy": 0.05898763036486344, "epoch": 1.455036486695033, "grad_norm": 2.875, "learning_rate": 0.00010970963029667625, "loss": 0.08764254570007324, "mean_token_accuracy": 0.9801486966013908, "num_tokens": 17430285.0, "step": 16500 }, { "entropy": 0.062050230994209414, "epoch": 1.4572411208359972, "grad_norm": 4.34375, "learning_rate": 0.00010947272094696632, "loss": 0.0902223014831543, "mean_token_accuracy": 0.9799307104945183, "num_tokens": 17455617.0, "step": 16525 }, { "entropy": 0.06111173116034479, "epoch": 1.4594457549769615, "grad_norm": 5.21875, "learning_rate": 0.00010923575793700008, "loss": 0.0825214958190918, "mean_token_accuracy": 0.9789560279250145, "num_tokens": 17481328.0, "step": 16550 }, { "entropy": 0.0673610715954419, "epoch": 1.461650389117926, "grad_norm": 2.484375, "learning_rate": 0.00010899874260910517, "loss": 0.09007739067077637, "mean_token_accuracy": 0.9788965311646461, "num_tokens": 17508388.0, "step": 16575 }, { "entropy": 0.06072841239583795, "epoch": 1.46385502325889, "grad_norm": 1.625, "learning_rate": 0.00010876167630590577, "loss": 0.08078091621398925, "mean_token_accuracy": 0.9796930846571922, "num_tokens": 17535426.0, "step": 16600 }, { "entropy": 0.0715254348909366, "epoch": 1.4660596573998546, "grad_norm": 0.7578125, "learning_rate": 0.00010852456037031462, "loss": 0.09795802116394042, "mean_token_accuracy": 0.9788025477528572, "num_tokens": 17562233.0, "step": 16625 }, { "entropy": 0.07394725729187485, "epoch": 1.4682642915408188, "grad_norm": 2.0, "learning_rate": 0.00010828739614552577, "loss": 0.10762414932250977, "mean_token_accuracy": 0.9766439932584763, "num_tokens": 17589739.0, "step": 16650 }, { "entropy": 0.057726316754124124, "epoch": 1.4704689256817831, "grad_norm": 1.2421875, "learning_rate": 0.00010805018497500674, "loss": 0.07878723621368408, "mean_token_accuracy": 0.9822485408186913, "num_tokens": 17615710.0, "step": 16675 }, { "entropy": 0.050235752706357745, "epoch": 1.4726735598227474, "grad_norm": 2.6875, "learning_rate": 0.000107812928202491, "loss": 0.06715853214263916, "mean_token_accuracy": 0.9836687427759171, "num_tokens": 17641492.0, "step": 16700 }, { "entropy": 0.060145079759095096, "epoch": 1.4748781939637117, "grad_norm": 4.40625, "learning_rate": 0.00010757562717197039, "loss": 0.102103910446167, "mean_token_accuracy": 0.9777582693099975, "num_tokens": 17668109.0, "step": 16725 }, { "entropy": 0.0603743210065295, "epoch": 1.477082828104676, "grad_norm": 3.671875, "learning_rate": 0.00010733828322768738, "loss": 0.079124174118042, "mean_token_accuracy": 0.9801980945467949, "num_tokens": 17694064.0, "step": 16750 }, { "entropy": 0.0633892720109725, "epoch": 1.4792874622456402, "grad_norm": 2.03125, "learning_rate": 0.00010710089771412752, "loss": 0.08133198738098145, "mean_token_accuracy": 0.9809553810954094, "num_tokens": 17720061.0, "step": 16775 }, { "entropy": 0.06459446613931505, "epoch": 1.4814920963866047, "grad_norm": 2.640625, "learning_rate": 0.00010686347197601197, "loss": 0.08791131019592285, "mean_token_accuracy": 0.9798620289564133, "num_tokens": 17747327.0, "step": 16800 }, { "entropy": 0.05424744870688301, "epoch": 1.483696730527569, "grad_norm": 1.0390625, "learning_rate": 0.00010662600735828963, "loss": 0.08159908294677734, "mean_token_accuracy": 0.9822318425774574, "num_tokens": 17773709.0, "step": 16825 }, { "entropy": 0.05502179280680139, "epoch": 1.4859013646685333, "grad_norm": 1.125, "learning_rate": 0.00010638850520612967, "loss": 0.06816422939300537, "mean_token_accuracy": 0.9826434323191643, "num_tokens": 17798111.0, "step": 16850 }, { "entropy": 0.05482125427632127, "epoch": 1.4881059988094976, "grad_norm": 0.9921875, "learning_rate": 0.00010615096686491387, "loss": 0.07486906051635742, "mean_token_accuracy": 0.9844721156358719, "num_tokens": 17823621.0, "step": 16875 }, { "entropy": 0.06038864826390636, "epoch": 1.4903106329504618, "grad_norm": 0.50390625, "learning_rate": 0.0001059133936802291, "loss": 0.06850027084350586, "mean_token_accuracy": 0.9790200263261795, "num_tokens": 17851059.0, "step": 16900 }, { "entropy": 0.05871053482478601, "epoch": 1.4925152670914261, "grad_norm": 2.46875, "learning_rate": 0.00010567578699785953, "loss": 0.0959819221496582, "mean_token_accuracy": 0.9798403069376945, "num_tokens": 17877902.0, "step": 16925 }, { "entropy": 0.055686095813143766, "epoch": 1.4947199012323904, "grad_norm": 0.59765625, "learning_rate": 0.00010543814816377902, "loss": 0.07089345932006835, "mean_token_accuracy": 0.9839478823542595, "num_tokens": 17904792.0, "step": 16950 }, { "entropy": 0.05614824989432236, "epoch": 1.496924535373355, "grad_norm": 1.5234375, "learning_rate": 0.00010520047852414371, "loss": 0.07994057178497314, "mean_token_accuracy": 0.9789690652489662, "num_tokens": 17930852.0, "step": 16975 }, { "entropy": 0.05661927604916855, "epoch": 1.4991291695143192, "grad_norm": 0.89453125, "learning_rate": 0.00010496277942528412, "loss": 0.0782936954498291, "mean_token_accuracy": 0.9812395250797272, "num_tokens": 17958147.0, "step": 17000 }, { "epoch": 1.4991291695143192, "eval_entropy": 0.028475278459578403, "eval_loss": 0.032593537122011185, "eval_mean_token_accuracy": 0.989732926459424, "eval_num_tokens": 17958147.0, "eval_runtime": 248.1012, "eval_samples_per_second": 15.836, "eval_steps_per_second": 3.962, "step": 17000 }, { "entropy": 0.056141581527772356, "epoch": 1.5013338036552835, "grad_norm": 4.15625, "learning_rate": 0.00010472505221369773, "loss": 0.06809127807617188, "mean_token_accuracy": 0.9798111498355866, "num_tokens": 17984383.0, "step": 17025 }, { "entropy": 0.06520437844737899, "epoch": 1.5035384377962477, "grad_norm": 2.734375, "learning_rate": 0.00010448729823604124, "loss": 0.10084832191467286, "mean_token_accuracy": 0.980366622209549, "num_tokens": 18012325.0, "step": 17050 }, { "entropy": 0.06314173155333265, "epoch": 1.505743071937212, "grad_norm": 3.140625, "learning_rate": 0.00010424951883912295, "loss": 0.07563630580902099, "mean_token_accuracy": 0.9817772355675697, "num_tokens": 18039522.0, "step": 17075 }, { "entropy": 0.047563048835654625, "epoch": 1.5079477060781763, "grad_norm": 1.3046875, "learning_rate": 0.00010401171536989517, "loss": 0.05595874309539795, "mean_token_accuracy": 0.9854984751343727, "num_tokens": 18064974.0, "step": 17100 }, { "entropy": 0.053548066731600556, "epoch": 1.5101523402191406, "grad_norm": 1.4296875, "learning_rate": 0.0001037738891754466, "loss": 0.07935151100158691, "mean_token_accuracy": 0.9796605777740478, "num_tokens": 18091851.0, "step": 17125 }, { "entropy": 0.05894872721350111, "epoch": 1.512356974360105, "grad_norm": 0.8984375, "learning_rate": 0.00010353604160299464, "loss": 0.08113452911376953, "mean_token_accuracy": 0.982445887029171, "num_tokens": 18118947.0, "step": 17150 }, { "entropy": 0.05036383649487106, "epoch": 1.5145616085010691, "grad_norm": 0.68359375, "learning_rate": 0.0001032981739998778, "loss": 0.05923966407775879, "mean_token_accuracy": 0.9860278350114823, "num_tokens": 18143717.0, "step": 17175 }, { "entropy": 0.046229548780756885, "epoch": 1.5167662426420336, "grad_norm": 0.373046875, "learning_rate": 0.000103060287713548, "loss": 0.07923832416534424, "mean_token_accuracy": 0.9831987258791923, "num_tokens": 18168349.0, "step": 17200 }, { "entropy": 0.06594804828648193, "epoch": 1.518970876782998, "grad_norm": 3.9375, "learning_rate": 0.00010282238409156315, "loss": 0.11481256484985351, "mean_token_accuracy": 0.9762162980437279, "num_tokens": 18194696.0, "step": 17225 }, { "entropy": 0.06344727827017778, "epoch": 1.5211755109239622, "grad_norm": 0.28125, "learning_rate": 0.00010258446448157917, "loss": 0.08004162788391113, "mean_token_accuracy": 0.9845002761483193, "num_tokens": 18222060.0, "step": 17250 }, { "entropy": 0.07739625903428532, "epoch": 1.5233801450649265, "grad_norm": 0.9375, "learning_rate": 0.00010234653023134276, "loss": 0.11823193550109863, "mean_token_accuracy": 0.9756862100958824, "num_tokens": 18250159.0, "step": 17275 }, { "entropy": 0.048326283237038296, "epoch": 1.5255847792058908, "grad_norm": 2.765625, "learning_rate": 0.00010210858268868328, "loss": 0.05785459041595459, "mean_token_accuracy": 0.9836695104837417, "num_tokens": 18276170.0, "step": 17300 }, { "entropy": 0.055761023027298504, "epoch": 1.5277894133468553, "grad_norm": 2.3125, "learning_rate": 0.00010187062320150564, "loss": 0.07718227386474609, "mean_token_accuracy": 0.9812486064434052, "num_tokens": 18302079.0, "step": 17325 }, { "entropy": 0.06133182929028408, "epoch": 1.5299940474878193, "grad_norm": 1.6484375, "learning_rate": 0.00010163265311778227, "loss": 0.07602161884307862, "mean_token_accuracy": 0.9805529493093491, "num_tokens": 18328582.0, "step": 17350 }, { "entropy": 0.04131406945714843, "epoch": 1.5321986816287838, "grad_norm": 1.421875, "learning_rate": 0.00010139467378554572, "loss": 0.059069275856018066, "mean_token_accuracy": 0.9861931943893433, "num_tokens": 18354126.0, "step": 17375 }, { "entropy": 0.05679673735721735, "epoch": 1.5344033157697479, "grad_norm": 1.953125, "learning_rate": 0.00010115668655288086, "loss": 0.08031403541564941, "mean_token_accuracy": 0.9827021709084511, "num_tokens": 18380922.0, "step": 17400 }, { "entropy": 0.050058743792469614, "epoch": 1.5366079499107124, "grad_norm": 1.3984375, "learning_rate": 0.0001009186927679173, "loss": 0.0634553337097168, "mean_token_accuracy": 0.9828399559855461, "num_tokens": 18406786.0, "step": 17425 }, { "entropy": 0.06921417500561802, "epoch": 1.5388125840516766, "grad_norm": 0.984375, "learning_rate": 0.0001006806937788218, "loss": 0.10088022232055664, "mean_token_accuracy": 0.9784321439266205, "num_tokens": 18434199.0, "step": 17450 }, { "entropy": 0.05730687389936065, "epoch": 1.541017218192641, "grad_norm": 0.96875, "learning_rate": 0.00010044269093379066, "loss": 0.07943611145019532, "mean_token_accuracy": 0.9813975363969802, "num_tokens": 18460028.0, "step": 17475 }, { "entropy": 0.0577721811196534, "epoch": 1.5432218523336052, "grad_norm": 2.71875, "learning_rate": 0.00010020468558104192, "loss": 0.07405065059661865, "mean_token_accuracy": 0.9820497670769691, "num_tokens": 18484935.0, "step": 17500 }, { "entropy": 0.06218616932768782, "epoch": 1.5454264864745695, "grad_norm": 2.15625, "learning_rate": 9.996667906880787e-05, "loss": 0.07520933151245117, "mean_token_accuracy": 0.9795030042529106, "num_tokens": 18509996.0, "step": 17525 }, { "entropy": 0.05698599866198492, "epoch": 1.547631120615534, "grad_norm": 0.7890625, "learning_rate": 9.972867274532739e-05, "loss": 0.0772179651260376, "mean_token_accuracy": 0.9817389845848083, "num_tokens": 18536971.0, "step": 17550 }, { "entropy": 0.05628544877305103, "epoch": 1.549835754756498, "grad_norm": 2.9375, "learning_rate": 9.94906679588382e-05, "loss": 0.07337986946105957, "mean_token_accuracy": 0.9811558586359024, "num_tokens": 18562553.0, "step": 17575 }, { "entropy": 0.05942114258214133, "epoch": 1.5520403888974625, "grad_norm": 1.3984375, "learning_rate": 9.925266605756944e-05, "loss": 0.07937191486358643, "mean_token_accuracy": 0.9792503651976585, "num_tokens": 18588415.0, "step": 17600 }, { "entropy": 0.05903674538480118, "epoch": 1.5542450230384268, "grad_norm": 2.796875, "learning_rate": 9.901466838973386e-05, "loss": 0.0790792989730835, "mean_token_accuracy": 0.9809808561205864, "num_tokens": 18615439.0, "step": 17625 }, { "entropy": 0.047130853843918886, "epoch": 1.556449657179391, "grad_norm": 0.609375, "learning_rate": 9.87766763035202e-05, "loss": 0.058544158935546875, "mean_token_accuracy": 0.9855070438981056, "num_tokens": 18641119.0, "step": 17650 }, { "entropy": 0.05112945525848772, "epoch": 1.5586542913203554, "grad_norm": 1.4921875, "learning_rate": 9.853869114708556e-05, "loss": 0.0614125919342041, "mean_token_accuracy": 0.9811950054764748, "num_tokens": 18666641.0, "step": 17675 }, { "entropy": 0.05161150803469354, "epoch": 1.5608589254613197, "grad_norm": 3.9375, "learning_rate": 9.830071426854784e-05, "loss": 0.06254351615905762, "mean_token_accuracy": 0.9838355273008347, "num_tokens": 18692161.0, "step": 17700 }, { "entropy": 0.058606664875915156, "epoch": 1.5630635596022842, "grad_norm": 1.171875, "learning_rate": 9.806274701597806e-05, "loss": 0.08459006309509277, "mean_token_accuracy": 0.9809410843253136, "num_tokens": 18718428.0, "step": 17725 }, { "entropy": 0.059776942255703035, "epoch": 1.5652681937432482, "grad_norm": 2.390625, "learning_rate": 9.782479073739268e-05, "loss": 0.07257438182830811, "mean_token_accuracy": 0.9825310951471329, "num_tokens": 18745938.0, "step": 17750 }, { "entropy": 0.05679215215088334, "epoch": 1.5674728278842127, "grad_norm": 1.46875, "learning_rate": 9.758684678074594e-05, "loss": 0.07460373878479004, "mean_token_accuracy": 0.9839976826310157, "num_tokens": 18771583.0, "step": 17775 }, { "entropy": 0.06317433909796819, "epoch": 1.5696774620251768, "grad_norm": 2.453125, "learning_rate": 9.734891649392238e-05, "loss": 0.08541607856750488, "mean_token_accuracy": 0.980264983177185, "num_tokens": 18799453.0, "step": 17800 }, { "entropy": 0.06873761809001736, "epoch": 1.5718820961661413, "grad_norm": 2.078125, "learning_rate": 9.711100122472908e-05, "loss": 0.10108554840087891, "mean_token_accuracy": 0.9776660186052323, "num_tokens": 18826048.0, "step": 17825 }, { "entropy": 0.048909812513666114, "epoch": 1.5740867303071056, "grad_norm": 2.53125, "learning_rate": 9.6873102320888e-05, "loss": 0.06565195560455322, "mean_token_accuracy": 0.9847191327810287, "num_tokens": 18852389.0, "step": 17850 }, { "entropy": 0.05157642891848809, "epoch": 1.5762913644480698, "grad_norm": 1.171875, "learning_rate": 9.663522113002844e-05, "loss": 0.08032276153564454, "mean_token_accuracy": 0.9816439139842987, "num_tokens": 18879043.0, "step": 17875 }, { "entropy": 0.048102616354735804, "epoch": 1.578495998589034, "grad_norm": 0.44140625, "learning_rate": 9.639735899967931e-05, "loss": 0.06498304843902587, "mean_token_accuracy": 0.9843746635317803, "num_tokens": 18905398.0, "step": 17900 }, { "entropy": 0.055048747494947745, "epoch": 1.5807006327299984, "grad_norm": 0.53515625, "learning_rate": 9.615951727726162e-05, "loss": 0.07438690662384033, "mean_token_accuracy": 0.981828630566597, "num_tokens": 18932068.0, "step": 17925 }, { "entropy": 0.05669259457528824, "epoch": 1.582905266870963, "grad_norm": 1.5703125, "learning_rate": 9.592169731008076e-05, "loss": 0.07773910522460938, "mean_token_accuracy": 0.9835181996226311, "num_tokens": 18959147.0, "step": 17950 }, { "entropy": 0.04661863596928015, "epoch": 1.585109901011927, "grad_norm": 0.54296875, "learning_rate": 9.568390044531887e-05, "loss": 0.06266080856323242, "mean_token_accuracy": 0.9864483639597893, "num_tokens": 18984022.0, "step": 17975 }, { "entropy": 0.05930834055492596, "epoch": 1.5873145351528914, "grad_norm": 0.70703125, "learning_rate": 9.544612803002711e-05, "loss": 0.09378492355346679, "mean_token_accuracy": 0.9771989992260933, "num_tokens": 19011238.0, "step": 18000 }, { "epoch": 1.5873145351528914, "eval_entropy": 0.025065774224757285, "eval_loss": 0.030993424355983734, "eval_mean_token_accuracy": 0.9900934713175435, "eval_num_tokens": 19011238.0, "eval_runtime": 247.7334, "eval_samples_per_second": 15.86, "eval_steps_per_second": 3.968, "step": 18000 }, { "entropy": 0.05400558569199347, "epoch": 1.5895191692938557, "grad_norm": 3.8125, "learning_rate": 9.520838141111833e-05, "loss": 0.07712131023406982, "mean_token_accuracy": 0.9816086456179619, "num_tokens": 19037710.0, "step": 18025 }, { "entropy": 0.05443809665448498, "epoch": 1.59172380343482, "grad_norm": 0.26953125, "learning_rate": 9.497066193535917e-05, "loss": 0.08530388832092285, "mean_token_accuracy": 0.981157968044281, "num_tokens": 19064722.0, "step": 18050 }, { "entropy": 0.05699371125425387, "epoch": 1.5939284375757843, "grad_norm": 2.5, "learning_rate": 9.473297094936247e-05, "loss": 0.07788813591003418, "mean_token_accuracy": 0.9828856268525124, "num_tokens": 19090316.0, "step": 18075 }, { "entropy": 0.059593895952129966, "epoch": 1.5961330717167486, "grad_norm": 1.359375, "learning_rate": 9.449530979957977e-05, "loss": 0.07819770812988282, "mean_token_accuracy": 0.9822254714369774, "num_tokens": 19117146.0, "step": 18100 }, { "entropy": 0.06011313027673168, "epoch": 1.598337705857713, "grad_norm": 0.9453125, "learning_rate": 9.425767983229346e-05, "loss": 0.07340410232543945, "mean_token_accuracy": 0.9806813663244247, "num_tokens": 19144537.0, "step": 18125 }, { "entropy": 0.04968562399721122, "epoch": 1.6005423399986771, "grad_norm": 0.875, "learning_rate": 9.402008239360944e-05, "loss": 0.06933138847351074, "mean_token_accuracy": 0.9852479723095894, "num_tokens": 19170209.0, "step": 18150 }, { "entropy": 0.060237830583500906, "epoch": 1.6027469741396416, "grad_norm": 0.384765625, "learning_rate": 9.378251882944932e-05, "loss": 0.07463406085968018, "mean_token_accuracy": 0.9815969231724739, "num_tokens": 19196223.0, "step": 18175 }, { "entropy": 0.05468057305733964, "epoch": 1.6049516082806057, "grad_norm": 0.984375, "learning_rate": 9.354499048554273e-05, "loss": 0.07429322242736816, "mean_token_accuracy": 0.9817409634590148, "num_tokens": 19222714.0, "step": 18200 }, { "entropy": 0.050176919560835816, "epoch": 1.6071562424215702, "grad_norm": 1.0, "learning_rate": 9.33074987074198e-05, "loss": 0.07442711353302002, "mean_token_accuracy": 0.9835793408751488, "num_tokens": 19248965.0, "step": 18225 }, { "entropy": 0.05845318389037857, "epoch": 1.6093608765625345, "grad_norm": 3.171875, "learning_rate": 9.307004484040361e-05, "loss": 0.07511641502380371, "mean_token_accuracy": 0.9822868245840073, "num_tokens": 19274620.0, "step": 18250 }, { "entropy": 0.057679404538066595, "epoch": 1.6115655107034987, "grad_norm": 0.5390625, "learning_rate": 9.28326302296025e-05, "loss": 0.08816563606262207, "mean_token_accuracy": 0.9820525646209717, "num_tokens": 19301416.0, "step": 18275 }, { "entropy": 0.05478143262356752, "epoch": 1.613770144844463, "grad_norm": 2.703125, "learning_rate": 9.259525621990227e-05, "loss": 0.0690168571472168, "mean_token_accuracy": 0.9814907485246658, "num_tokens": 19328100.0, "step": 18300 }, { "entropy": 0.04510672772856197, "epoch": 1.6159747789854273, "grad_norm": 0.384765625, "learning_rate": 9.235792415595887e-05, "loss": 0.062098612785339354, "mean_token_accuracy": 0.9843040198087692, "num_tokens": 19353794.0, "step": 18325 }, { "entropy": 0.058022860687196955, "epoch": 1.6181794131263918, "grad_norm": 3.0, "learning_rate": 9.212063538219059e-05, "loss": 0.07701887130737305, "mean_token_accuracy": 0.9803284251689911, "num_tokens": 19381164.0, "step": 18350 }, { "entropy": 0.0572402674825571, "epoch": 1.6203840472673559, "grad_norm": 1.0546875, "learning_rate": 9.188339124277056e-05, "loss": 0.09371196746826171, "mean_token_accuracy": 0.9804246252775193, "num_tokens": 19408403.0, "step": 18375 }, { "entropy": 0.056358818953158335, "epoch": 1.6225886814083204, "grad_norm": 1.34375, "learning_rate": 9.164619308161894e-05, "loss": 0.07747916221618652, "mean_token_accuracy": 0.9822709369659424, "num_tokens": 19435431.0, "step": 18400 }, { "entropy": 0.05615487261035014, "epoch": 1.6247933155492846, "grad_norm": 0.65234375, "learning_rate": 9.140904224239555e-05, "loss": 0.0793535327911377, "mean_token_accuracy": 0.9820253443717957, "num_tokens": 19461362.0, "step": 18425 }, { "entropy": 0.04601802124358073, "epoch": 1.626997949690249, "grad_norm": 0.462890625, "learning_rate": 9.117194006849207e-05, "loss": 0.057888431549072264, "mean_token_accuracy": 0.9839033776521683, "num_tokens": 19487621.0, "step": 18450 }, { "entropy": 0.05517784896073863, "epoch": 1.6292025838312132, "grad_norm": 3.265625, "learning_rate": 9.09348879030246e-05, "loss": 0.06926989078521728, "mean_token_accuracy": 0.9825824412703514, "num_tokens": 19514533.0, "step": 18475 }, { "entropy": 0.05407030944246799, "epoch": 1.6314072179721775, "grad_norm": 1.6875, "learning_rate": 9.069788708882582e-05, "loss": 0.06324063777923584, "mean_token_accuracy": 0.9833697432279587, "num_tokens": 19540686.0, "step": 18500 }, { "entropy": 0.049242617936106396, "epoch": 1.633611852113142, "grad_norm": 0.98046875, "learning_rate": 9.046093896843764e-05, "loss": 0.07035034656524658, "mean_token_accuracy": 0.982454896569252, "num_tokens": 19566734.0, "step": 18525 }, { "entropy": 0.05330205393038341, "epoch": 1.635816486254106, "grad_norm": 1.6484375, "learning_rate": 9.02240448841034e-05, "loss": 0.07815911769866943, "mean_token_accuracy": 0.9762992030382156, "num_tokens": 19592970.0, "step": 18550 }, { "entropy": 0.060869988883641785, "epoch": 1.6380211203950705, "grad_norm": 0.875, "learning_rate": 8.998720617776044e-05, "loss": 0.08916851043701172, "mean_token_accuracy": 0.9811286148428917, "num_tokens": 19619172.0, "step": 18575 }, { "entropy": 0.06743482278645388, "epoch": 1.6402257545360346, "grad_norm": 1.9296875, "learning_rate": 8.975042419103222e-05, "loss": 0.09006739616394042, "mean_token_accuracy": 0.9787856066226959, "num_tokens": 19645154.0, "step": 18600 }, { "entropy": 0.06285342701012268, "epoch": 1.642430388676999, "grad_norm": 1.7265625, "learning_rate": 8.951370026522109e-05, "loss": 0.0924040412902832, "mean_token_accuracy": 0.9793640455603599, "num_tokens": 19672192.0, "step": 18625 }, { "entropy": 0.05975484314971254, "epoch": 1.6446350228179634, "grad_norm": 3.296875, "learning_rate": 8.92770357413004e-05, "loss": 0.07495357990264892, "mean_token_accuracy": 0.982607415318489, "num_tokens": 19698226.0, "step": 18650 }, { "entropy": 0.04880306238286721, "epoch": 1.6468396569589276, "grad_norm": 2.015625, "learning_rate": 8.904043195990707e-05, "loss": 0.06099768161773682, "mean_token_accuracy": 0.9863478910923004, "num_tokens": 19724505.0, "step": 18675 }, { "entropy": 0.05574701564350107, "epoch": 1.649044291099892, "grad_norm": 2.65625, "learning_rate": 8.880389026133378e-05, "loss": 0.09057734489440918, "mean_token_accuracy": 0.9812796249985695, "num_tokens": 19750809.0, "step": 18700 }, { "entropy": 0.05721397206405527, "epoch": 1.6512489252408562, "grad_norm": 0.87890625, "learning_rate": 8.856741198552171e-05, "loss": 0.09432357788085938, "mean_token_accuracy": 0.9806243237853051, "num_tokens": 19776758.0, "step": 18725 }, { "entropy": 0.051816781606976295, "epoch": 1.6534535593818207, "grad_norm": 4.65625, "learning_rate": 8.83309984720527e-05, "loss": 0.0762649917602539, "mean_token_accuracy": 0.9829356080293655, "num_tokens": 19801783.0, "step": 18750 }, { "entropy": 0.05103291466111841, "epoch": 1.6556581935227848, "grad_norm": 3.171875, "learning_rate": 8.809465106014173e-05, "loss": 0.07494559288024902, "mean_token_accuracy": 0.9815609979629517, "num_tokens": 19828390.0, "step": 18775 }, { "entropy": 0.05235413030990457, "epoch": 1.6578628276637493, "grad_norm": 0.67578125, "learning_rate": 8.785837108862926e-05, "loss": 0.06882952213287354, "mean_token_accuracy": 0.9843146482110023, "num_tokens": 19854446.0, "step": 18800 }, { "entropy": 0.050876114333077566, "epoch": 1.6600674618047135, "grad_norm": 1.7734375, "learning_rate": 8.762215989597383e-05, "loss": 0.07045453071594238, "mean_token_accuracy": 0.9855261752009392, "num_tokens": 19879183.0, "step": 18825 }, { "entropy": 0.04869757466043666, "epoch": 1.6622720959456778, "grad_norm": 1.1875, "learning_rate": 8.738601882024435e-05, "loss": 0.0628709602355957, "mean_token_accuracy": 0.9841243025660514, "num_tokens": 19905109.0, "step": 18850 }, { "entropy": 0.04798712164454628, "epoch": 1.664476730086642, "grad_norm": 0.6953125, "learning_rate": 8.714994919911247e-05, "loss": 0.07689383506774902, "mean_token_accuracy": 0.9856678175926209, "num_tokens": 19932085.0, "step": 18875 }, { "entropy": 0.05964564758811321, "epoch": 1.6666813642276064, "grad_norm": 2.328125, "learning_rate": 8.691395236984516e-05, "loss": 0.08691888809204101, "mean_token_accuracy": 0.9784200477600098, "num_tokens": 19960684.0, "step": 18900 }, { "entropy": 0.05663166692873347, "epoch": 1.6688859983685709, "grad_norm": 2.46875, "learning_rate": 8.667802966929694e-05, "loss": 0.06777063846588134, "mean_token_accuracy": 0.9836448773741722, "num_tokens": 19986876.0, "step": 18925 }, { "entropy": 0.04777500177915499, "epoch": 1.671090632509535, "grad_norm": 0.1982421875, "learning_rate": 8.644218243390248e-05, "loss": 0.06499977588653565, "mean_token_accuracy": 0.9848168486356735, "num_tokens": 20012694.0, "step": 18950 }, { "entropy": 0.04975124435048201, "epoch": 1.6732952666504994, "grad_norm": 1.8515625, "learning_rate": 8.620641199966901e-05, "loss": 0.06578896045684815, "mean_token_accuracy": 0.9852525860071182, "num_tokens": 20039158.0, "step": 18975 }, { "entropy": 0.06006684680622129, "epoch": 1.6754999007914635, "grad_norm": 1.4375, "learning_rate": 8.597071970216861e-05, "loss": 0.08604469299316406, "mean_token_accuracy": 0.9798215964436531, "num_tokens": 20066712.0, "step": 19000 }, { "epoch": 1.6754999007914635, "eval_entropy": 0.02392154005389293, "eval_loss": 0.027977393940091133, "eval_mean_token_accuracy": 0.9912456352858674, "eval_num_tokens": 20066712.0, "eval_runtime": 244.0079, "eval_samples_per_second": 16.102, "eval_steps_per_second": 4.029, "step": 19000 }, { "entropy": 0.0617493161274615, "epoch": 1.677704534932428, "grad_norm": 0.7109375, "learning_rate": 8.573510687653072e-05, "loss": 0.08513182640075684, "mean_token_accuracy": 0.9791978281736374, "num_tokens": 20093148.0, "step": 19025 }, { "entropy": 0.058141036685192375, "epoch": 1.6799091690733923, "grad_norm": 1.1171875, "learning_rate": 8.549957485743467e-05, "loss": 0.0842889404296875, "mean_token_accuracy": 0.9809462291002273, "num_tokens": 20119244.0, "step": 19050 }, { "entropy": 0.062483344532229236, "epoch": 1.6821138032143566, "grad_norm": 3.921875, "learning_rate": 8.526412497910208e-05, "loss": 0.08513738632202149, "mean_token_accuracy": 0.9823807805776597, "num_tokens": 20145783.0, "step": 19075 }, { "entropy": 0.047624338713576436, "epoch": 1.6843184373553208, "grad_norm": 3.453125, "learning_rate": 8.502875857528916e-05, "loss": 0.05738876819610596, "mean_token_accuracy": 0.9847044974565506, "num_tokens": 20173172.0, "step": 19100 }, { "entropy": 0.05098998494475381, "epoch": 1.686523071496285, "grad_norm": 1.8359375, "learning_rate": 8.47934769792793e-05, "loss": 0.06551202774047851, "mean_token_accuracy": 0.9852928957343101, "num_tokens": 20200153.0, "step": 19125 }, { "entropy": 0.057541024469246624, "epoch": 1.6887277056372496, "grad_norm": 1.375, "learning_rate": 8.45582815238755e-05, "loss": 0.08337347984313964, "mean_token_accuracy": 0.9810384732484817, "num_tokens": 20228509.0, "step": 19150 }, { "entropy": 0.052381179301446534, "epoch": 1.6909323397782137, "grad_norm": 2.453125, "learning_rate": 8.432317354139276e-05, "loss": 0.07328851222991943, "mean_token_accuracy": 0.9824663576483726, "num_tokens": 20254949.0, "step": 19175 }, { "entropy": 0.057044620190863496, "epoch": 1.6931369739191782, "grad_norm": 0.70703125, "learning_rate": 8.408815436365066e-05, "loss": 0.06466145038604737, "mean_token_accuracy": 0.9842689520120621, "num_tokens": 20282270.0, "step": 19200 }, { "entropy": 0.044989108476002, "epoch": 1.6953416080601424, "grad_norm": 0.310546875, "learning_rate": 8.38532253219656e-05, "loss": 0.05739119529724121, "mean_token_accuracy": 0.9873190036416054, "num_tokens": 20308285.0, "step": 19225 }, { "entropy": 0.05179535857627343, "epoch": 1.6975462422011067, "grad_norm": 1.4609375, "learning_rate": 8.361838774714343e-05, "loss": 0.07189542770385743, "mean_token_accuracy": 0.9823496028780937, "num_tokens": 20335541.0, "step": 19250 }, { "entropy": 0.04700690143959946, "epoch": 1.699750876342071, "grad_norm": 1.7109375, "learning_rate": 8.338364296947192e-05, "loss": 0.06394123077392579, "mean_token_accuracy": 0.983540931046009, "num_tokens": 20361464.0, "step": 19275 }, { "entropy": 0.05009625876868085, "epoch": 1.7019555104830353, "grad_norm": 1.1015625, "learning_rate": 8.314899231871316e-05, "loss": 0.0703705358505249, "mean_token_accuracy": 0.9844644451141358, "num_tokens": 20387924.0, "step": 19300 }, { "entropy": 0.05482434216442925, "epoch": 1.7041601446239998, "grad_norm": 0.2216796875, "learning_rate": 8.291443712409595e-05, "loss": 0.0688732099533081, "mean_token_accuracy": 0.9761506044864654, "num_tokens": 20414494.0, "step": 19325 }, { "entropy": 0.05794863952425658, "epoch": 1.7063647787649638, "grad_norm": 2.71875, "learning_rate": 8.267997871430844e-05, "loss": 0.07615211009979247, "mean_token_accuracy": 0.9809840077161789, "num_tokens": 20440990.0, "step": 19350 }, { "entropy": 0.03766856579939486, "epoch": 1.7085694129059283, "grad_norm": 1.1171875, "learning_rate": 8.244561841749048e-05, "loss": 0.0410503339767456, "mean_token_accuracy": 0.98747672945261, "num_tokens": 20466090.0, "step": 19375 }, { "entropy": 0.048852927869920675, "epoch": 1.7107740470468926, "grad_norm": 2.59375, "learning_rate": 8.221135756122625e-05, "loss": 0.0601063871383667, "mean_token_accuracy": 0.9837045565247535, "num_tokens": 20492618.0, "step": 19400 }, { "entropy": 0.044831587293992926, "epoch": 1.712978681187857, "grad_norm": 1.34375, "learning_rate": 8.19771974725364e-05, "loss": 0.05813504695892334, "mean_token_accuracy": 0.9861867216229439, "num_tokens": 20518674.0, "step": 19425 }, { "entropy": 0.059993671221309344, "epoch": 1.7151833153288212, "grad_norm": 1.9609375, "learning_rate": 8.1743139477871e-05, "loss": 0.07705019474029541, "mean_token_accuracy": 0.9792517521977424, "num_tokens": 20545279.0, "step": 19450 }, { "entropy": 0.05164817209908506, "epoch": 1.7173879494697855, "grad_norm": 0.96875, "learning_rate": 8.150918490310163e-05, "loss": 0.060844712257385254, "mean_token_accuracy": 0.9835267442464829, "num_tokens": 20572166.0, "step": 19475 }, { "entropy": 0.0489783468087262, "epoch": 1.71959258361075, "grad_norm": 2.125, "learning_rate": 8.127533507351415e-05, "loss": 0.06445716381072998, "mean_token_accuracy": 0.9832936239242553, "num_tokens": 20598836.0, "step": 19500 }, { "entropy": 0.052891870438106704, "epoch": 1.721797217751714, "grad_norm": 1.4375, "learning_rate": 8.104159131380089e-05, "loss": 0.07390643119812011, "mean_token_accuracy": 0.9825085845589637, "num_tokens": 20625740.0, "step": 19525 }, { "entropy": 0.04920050672066281, "epoch": 1.7240018518926785, "grad_norm": 1.578125, "learning_rate": 8.080795494805351e-05, "loss": 0.06204095840454102, "mean_token_accuracy": 0.9850746482610703, "num_tokens": 20652602.0, "step": 19550 }, { "entropy": 0.053812768571224294, "epoch": 1.7262064860336426, "grad_norm": 1.2578125, "learning_rate": 8.057442729975518e-05, "loss": 0.07690982341766357, "mean_token_accuracy": 0.9800240156054497, "num_tokens": 20678867.0, "step": 19575 }, { "entropy": 0.04677354415842274, "epoch": 1.728411120174607, "grad_norm": 0.8359375, "learning_rate": 8.034100969177337e-05, "loss": 0.05719516754150391, "mean_token_accuracy": 0.9842693316936493, "num_tokens": 20705807.0, "step": 19600 }, { "entropy": 0.048505159118867595, "epoch": 1.7306157543155714, "grad_norm": 0.7578125, "learning_rate": 8.010770344635199e-05, "loss": 0.06632734775543213, "mean_token_accuracy": 0.984979897737503, "num_tokens": 20733283.0, "step": 19625 }, { "entropy": 0.04785260035292595, "epoch": 1.7328203884565356, "grad_norm": 2.890625, "learning_rate": 7.987450988510427e-05, "loss": 0.0693046760559082, "mean_token_accuracy": 0.9795151484012604, "num_tokens": 20760048.0, "step": 19650 }, { "entropy": 0.047811845683754656, "epoch": 1.7350250225975, "grad_norm": 2.140625, "learning_rate": 7.964143032900513e-05, "loss": 0.07035356521606445, "mean_token_accuracy": 0.9821845233440399, "num_tokens": 20784985.0, "step": 19675 }, { "entropy": 0.04927277403214248, "epoch": 1.7372296567384642, "grad_norm": 0.369140625, "learning_rate": 7.94084660983836e-05, "loss": 0.06305365085601806, "mean_token_accuracy": 0.9862342929840088, "num_tokens": 20811868.0, "step": 19700 }, { "entropy": 0.05829433169667027, "epoch": 1.7394342908794287, "grad_norm": 0.302734375, "learning_rate": 7.917561851291538e-05, "loss": 0.0748593282699585, "mean_token_accuracy": 0.9797840613126755, "num_tokens": 20839057.0, "step": 19725 }, { "entropy": 0.04679221799589868, "epoch": 1.7416389250203927, "grad_norm": 1.234375, "learning_rate": 7.894288889161554e-05, "loss": 0.06042766571044922, "mean_token_accuracy": 0.9846546384692192, "num_tokens": 20864043.0, "step": 19750 }, { "entropy": 0.05859222490020329, "epoch": 1.7438435591613572, "grad_norm": 0.703125, "learning_rate": 7.871027855283088e-05, "loss": 0.09572078704833985, "mean_token_accuracy": 0.9796414789557457, "num_tokens": 20891095.0, "step": 19775 }, { "entropy": 0.0481031046868884, "epoch": 1.7460481933023215, "grad_norm": 1.0859375, "learning_rate": 7.847778881423247e-05, "loss": 0.06286589622497558, "mean_token_accuracy": 0.9845766887068749, "num_tokens": 20916691.0, "step": 19800 }, { "entropy": 0.048571501801998235, "epoch": 1.7482528274432858, "grad_norm": 1.5859375, "learning_rate": 7.824542099280817e-05, "loss": 0.06077055931091309, "mean_token_accuracy": 0.9858067938685418, "num_tokens": 20943403.0, "step": 19825 }, { "entropy": 0.053936819200171154, "epoch": 1.75045746158425, "grad_norm": 3.109375, "learning_rate": 7.801317640485528e-05, "loss": 0.06340930938720703, "mean_token_accuracy": 0.9828820812702179, "num_tokens": 20969337.0, "step": 19850 }, { "entropy": 0.05901100598319317, "epoch": 1.7526620957252144, "grad_norm": 2.359375, "learning_rate": 7.778105636597305e-05, "loss": 0.08228662490844726, "mean_token_accuracy": 0.9798002752661705, "num_tokens": 20997335.0, "step": 19875 }, { "entropy": 0.04519621968145657, "epoch": 1.7548667298661789, "grad_norm": 0.296875, "learning_rate": 7.75490621910551e-05, "loss": 0.05941192150115967, "mean_token_accuracy": 0.9843781432509422, "num_tokens": 21024018.0, "step": 19900 }, { "entropy": 0.054847391085204436, "epoch": 1.757071364007143, "grad_norm": 0.361328125, "learning_rate": 7.731719519428217e-05, "loss": 0.0830865478515625, "mean_token_accuracy": 0.9809887805581092, "num_tokens": 21050891.0, "step": 19925 }, { "entropy": 0.06024816707678838, "epoch": 1.7592759981481074, "grad_norm": 1.09375, "learning_rate": 7.708545668911443e-05, "loss": 0.0814453887939453, "mean_token_accuracy": 0.9796580225229263, "num_tokens": 21079502.0, "step": 19950 }, { "entropy": 0.056985975124116524, "epoch": 1.7614806322890715, "grad_norm": 0.408203125, "learning_rate": 7.685384798828432e-05, "loss": 0.07116193771362304, "mean_token_accuracy": 0.9817749384045601, "num_tokens": 21105896.0, "step": 19975 }, { "entropy": 0.04627459083298163, "epoch": 1.763685266430036, "grad_norm": 1.1875, "learning_rate": 7.662237040378895e-05, "loss": 0.06480787754058838, "mean_token_accuracy": 0.9857170182466507, "num_tokens": 21132663.0, "step": 20000 }, { "epoch": 1.763685266430036, "eval_entropy": 0.023058283942197815, "eval_loss": 0.026535965502262115, "eval_mean_token_accuracy": 0.9916715127162992, "eval_num_tokens": 21132663.0, "eval_runtime": 242.6864, "eval_samples_per_second": 16.19, "eval_steps_per_second": 4.05, "step": 20000 }, { "entropy": 0.04721232044874341, "epoch": 1.7658899005710003, "grad_norm": 0.99609375, "learning_rate": 7.639102524688265e-05, "loss": 0.05121193885803223, "mean_token_accuracy": 0.9868710726499558, "num_tokens": 21158041.0, "step": 20025 }, { "entropy": 0.04733763427138911, "epoch": 1.7680945347119645, "grad_norm": 2.453125, "learning_rate": 7.615981382806956e-05, "loss": 0.06285818576812745, "mean_token_accuracy": 0.9839335259795189, "num_tokens": 21183629.0, "step": 20050 }, { "entropy": 0.04657574629578448, "epoch": 1.7702991688529288, "grad_norm": 3.3125, "learning_rate": 7.59287374570963e-05, "loss": 0.07313352108001708, "mean_token_accuracy": 0.984621779024601, "num_tokens": 21209670.0, "step": 20075 }, { "entropy": 0.04935747715455364, "epoch": 1.772503802993893, "grad_norm": 1.203125, "learning_rate": 7.569779744294447e-05, "loss": 0.06564640998840332, "mean_token_accuracy": 0.9850915068387985, "num_tokens": 21235640.0, "step": 20100 }, { "entropy": 0.04392121256401879, "epoch": 1.7747084371348576, "grad_norm": 0.9453125, "learning_rate": 7.546699509382324e-05, "loss": 0.06469783306121826, "mean_token_accuracy": 0.9833244362473488, "num_tokens": 21261662.0, "step": 20125 }, { "entropy": 0.04485419812590408, "epoch": 1.7769130712758217, "grad_norm": 1.8125, "learning_rate": 7.523633171716194e-05, "loss": 0.05138749122619629, "mean_token_accuracy": 0.985532431602478, "num_tokens": 21288209.0, "step": 20150 }, { "entropy": 0.046874258878851834, "epoch": 1.7791177054167862, "grad_norm": 1.75, "learning_rate": 7.50058086196026e-05, "loss": 0.06949950218200683, "mean_token_accuracy": 0.9838392865657807, "num_tokens": 21314032.0, "step": 20175 }, { "entropy": 0.054706107557649375, "epoch": 1.7813223395577504, "grad_norm": 2.59375, "learning_rate": 7.477542710699275e-05, "loss": 0.07922077655792237, "mean_token_accuracy": 0.9834457024931907, "num_tokens": 21339944.0, "step": 20200 }, { "entropy": 0.05576091543363873, "epoch": 1.7835269736987147, "grad_norm": 1.859375, "learning_rate": 7.454518848437782e-05, "loss": 0.07751681327819825, "mean_token_accuracy": 0.9836757770180702, "num_tokens": 21366032.0, "step": 20225 }, { "entropy": 0.06782450357182825, "epoch": 1.785731607839679, "grad_norm": 1.3671875, "learning_rate": 7.43150940559937e-05, "loss": 0.09584000587463379, "mean_token_accuracy": 0.9782721415162087, "num_tokens": 21393287.0, "step": 20250 }, { "entropy": 0.050107013575034214, "epoch": 1.7879362419806433, "grad_norm": 2.734375, "learning_rate": 7.408514512525961e-05, "loss": 0.06766324996948242, "mean_token_accuracy": 0.9848771205544472, "num_tokens": 21419444.0, "step": 20275 }, { "entropy": 0.04845013963735255, "epoch": 1.7901408761216078, "grad_norm": 1.1953125, "learning_rate": 7.385534299477049e-05, "loss": 0.0615593433380127, "mean_token_accuracy": 0.9860979354381562, "num_tokens": 21444863.0, "step": 20300 }, { "entropy": 0.04458080105272529, "epoch": 1.7923455102625718, "grad_norm": 1.4296875, "learning_rate": 7.362568896628977e-05, "loss": 0.05469425678253174, "mean_token_accuracy": 0.9860287711024285, "num_tokens": 21471190.0, "step": 20325 }, { "entropy": 0.048252884604953576, "epoch": 1.7945501444035363, "grad_norm": 1.828125, "learning_rate": 7.339618434074182e-05, "loss": 0.07198523044586182, "mean_token_accuracy": 0.9825797024369239, "num_tokens": 21497177.0, "step": 20350 }, { "entropy": 0.0426078699176287, "epoch": 1.7967547785445004, "grad_norm": 0.9453125, "learning_rate": 7.316683041820474e-05, "loss": 0.07130131244659424, "mean_token_accuracy": 0.9853621581196785, "num_tokens": 21521844.0, "step": 20375 }, { "entropy": 0.04121265197856701, "epoch": 1.7989594126854649, "grad_norm": 0.419921875, "learning_rate": 7.293762849790294e-05, "loss": 0.055575294494628905, "mean_token_accuracy": 0.9851228359341622, "num_tokens": 21546943.0, "step": 20400 }, { "entropy": 0.04701503006057464, "epoch": 1.8011640468264292, "grad_norm": 0.21875, "learning_rate": 7.270857987819984e-05, "loss": 0.061670899391174316, "mean_token_accuracy": 0.9868606904149055, "num_tokens": 21572935.0, "step": 20425 }, { "entropy": 0.04451014851059881, "epoch": 1.8033686809673934, "grad_norm": 3.21875, "learning_rate": 7.247968585659032e-05, "loss": 0.060894203186035153, "mean_token_accuracy": 0.9842220428586006, "num_tokens": 21599217.0, "step": 20450 }, { "entropy": 0.059572365196509054, "epoch": 1.8055733151083577, "grad_norm": 2.34375, "learning_rate": 7.225094772969361e-05, "loss": 0.07416722774505616, "mean_token_accuracy": 0.9822189456224442, "num_tokens": 21626925.0, "step": 20475 }, { "entropy": 0.046611685622046935, "epoch": 1.807777949249322, "grad_norm": 0.828125, "learning_rate": 7.202236679324581e-05, "loss": 0.06417181015014649, "mean_token_accuracy": 0.9834126874804496, "num_tokens": 21653311.0, "step": 20500 }, { "entropy": 0.04335411591440788, "epoch": 1.8099825833902865, "grad_norm": 0.8359375, "learning_rate": 7.179394434209264e-05, "loss": 0.058134937286376955, "mean_token_accuracy": 0.9870638877153397, "num_tokens": 21677976.0, "step": 20525 }, { "entropy": 0.05337048692628741, "epoch": 1.8121872175312506, "grad_norm": 1.1796875, "learning_rate": 7.156568167018187e-05, "loss": 0.06563668727874755, "mean_token_accuracy": 0.9823853915929794, "num_tokens": 21705733.0, "step": 20550 }, { "entropy": 0.03864829636509967, "epoch": 1.814391851672215, "grad_norm": 0.0947265625, "learning_rate": 7.133758007055639e-05, "loss": 0.05145081520080566, "mean_token_accuracy": 0.9869194403290749, "num_tokens": 21731242.0, "step": 20575 }, { "entropy": 0.0520756857453307, "epoch": 1.8165964858131793, "grad_norm": 9.375, "learning_rate": 7.110964083534651e-05, "loss": 0.07184930324554444, "mean_token_accuracy": 0.9811625573039054, "num_tokens": 21757526.0, "step": 20600 }, { "entropy": 0.0557931135634135, "epoch": 1.8188011199541436, "grad_norm": 0.74609375, "learning_rate": 7.088186525576289e-05, "loss": 0.07704683303833008, "mean_token_accuracy": 0.9804368880391121, "num_tokens": 21783946.0, "step": 20625 }, { "entropy": 0.04322621882762178, "epoch": 1.821005754095108, "grad_norm": 0.1494140625, "learning_rate": 7.0654254622089e-05, "loss": 0.05452206611633301, "mean_token_accuracy": 0.9864040830731392, "num_tokens": 21809770.0, "step": 20650 }, { "entropy": 0.04771335199315217, "epoch": 1.8232103882360722, "grad_norm": 0.59375, "learning_rate": 7.042681022367406e-05, "loss": 0.06347963333129883, "mean_token_accuracy": 0.9841774880886078, "num_tokens": 21835665.0, "step": 20675 }, { "entropy": 0.053830798321432664, "epoch": 1.8254150223770367, "grad_norm": 1.8515625, "learning_rate": 7.019953334892557e-05, "loss": 0.07136422634124756, "mean_token_accuracy": 0.9813834890723229, "num_tokens": 21862527.0, "step": 20700 }, { "entropy": 0.04650898744526785, "epoch": 1.8276196565180007, "grad_norm": 1.7109375, "learning_rate": 6.99724252853021e-05, "loss": 0.05398883819580078, "mean_token_accuracy": 0.9853073519468307, "num_tokens": 21889036.0, "step": 20725 }, { "entropy": 0.05225651402957737, "epoch": 1.8298242906589652, "grad_norm": 1.1328125, "learning_rate": 6.974548731930582e-05, "loss": 0.06863418102264404, "mean_token_accuracy": 0.9822486186027527, "num_tokens": 21916165.0, "step": 20750 }, { "entropy": 0.055327791996278394, "epoch": 1.8320289247999293, "grad_norm": 1.8203125, "learning_rate": 6.951872073647546e-05, "loss": 0.06615938663482666, "mean_token_accuracy": 0.9840937641263008, "num_tokens": 21944120.0, "step": 20775 }, { "entropy": 0.04494785757124191, "epoch": 1.8342335589408938, "grad_norm": 2.265625, "learning_rate": 6.929212682137896e-05, "loss": 0.05958599090576172, "mean_token_accuracy": 0.9841647908091545, "num_tokens": 21970158.0, "step": 20800 }, { "entropy": 0.04708563433858217, "epoch": 1.836438193081858, "grad_norm": 1.375, "learning_rate": 6.906570685760602e-05, "loss": 0.05993311405181885, "mean_token_accuracy": 0.9865586760640145, "num_tokens": 21995764.0, "step": 20825 }, { "entropy": 0.05176374662743911, "epoch": 1.8386428272228224, "grad_norm": 0.58203125, "learning_rate": 6.8839462127761e-05, "loss": 0.07699875831604004, "mean_token_accuracy": 0.9810626646876335, "num_tokens": 22022035.0, "step": 20850 }, { "entropy": 0.045616496206566805, "epoch": 1.8408474613637866, "grad_norm": 2.5, "learning_rate": 6.861339391345563e-05, "loss": 0.06166870594024658, "mean_token_accuracy": 0.9853792524337769, "num_tokens": 22048310.0, "step": 20875 }, { "entropy": 0.05039491361167166, "epoch": 1.843052095504751, "grad_norm": 1.6875, "learning_rate": 6.838750349530175e-05, "loss": 0.06637926578521729, "mean_token_accuracy": 0.9829424172639847, "num_tokens": 22074551.0, "step": 20900 }, { "entropy": 0.04769486486184178, "epoch": 1.8452567296457154, "grad_norm": 1.0234375, "learning_rate": 6.8161792152904e-05, "loss": 0.05665205478668213, "mean_token_accuracy": 0.9840876114368439, "num_tokens": 22100230.0, "step": 20925 }, { "entropy": 0.0528660134456004, "epoch": 1.8474613637866795, "grad_norm": 1.546875, "learning_rate": 6.793626116485261e-05, "loss": 0.07378475189208984, "mean_token_accuracy": 0.9837013658881187, "num_tokens": 22125633.0, "step": 20950 }, { "entropy": 0.05890303898835555, "epoch": 1.849665997927644, "grad_norm": 1.375, "learning_rate": 6.771091180871611e-05, "loss": 0.0943376350402832, "mean_token_accuracy": 0.9790048637986183, "num_tokens": 22152592.0, "step": 20975 }, { "entropy": 0.04344821692240657, "epoch": 1.8518706320686082, "grad_norm": 3.375, "learning_rate": 6.748574536103424e-05, "loss": 0.05888943672180176, "mean_token_accuracy": 0.98219693005085, "num_tokens": 22178877.0, "step": 21000 }, { "epoch": 1.8518706320686082, "eval_entropy": 0.022571654543839585, "eval_loss": 0.025132818147540092, "eval_mean_token_accuracy": 0.992005098508794, "eval_num_tokens": 22178877.0, "eval_runtime": 243.2087, "eval_samples_per_second": 16.155, "eval_steps_per_second": 4.042, "step": 21000 }, { "entropy": 0.0381835427560145, "epoch": 1.8540752662095725, "grad_norm": 0.376953125, "learning_rate": 6.726076309731056e-05, "loss": 0.040510034561157225, "mean_token_accuracy": 0.9873276236653328, "num_tokens": 22202989.0, "step": 21025 }, { "entropy": 0.05477136584289838, "epoch": 1.8562799003505368, "grad_norm": 2.234375, "learning_rate": 6.70359662920053e-05, "loss": 0.06961938858032227, "mean_token_accuracy": 0.9818892487883568, "num_tokens": 22229607.0, "step": 21050 }, { "entropy": 0.060442834978384784, "epoch": 1.858484534491501, "grad_norm": 2.203125, "learning_rate": 6.681135621852803e-05, "loss": 0.08862739562988281, "mean_token_accuracy": 0.9802686884999275, "num_tokens": 22257392.0, "step": 21075 }, { "entropy": 0.04568748531426536, "epoch": 1.8606891686324656, "grad_norm": 2.515625, "learning_rate": 6.658693414923064e-05, "loss": 0.05948817729949951, "mean_token_accuracy": 0.986970128417015, "num_tokens": 22283172.0, "step": 21100 }, { "entropy": 0.05226036834908882, "epoch": 1.8628938027734296, "grad_norm": 1.625, "learning_rate": 6.636270135540004e-05, "loss": 0.06003546714782715, "mean_token_accuracy": 0.984726087152958, "num_tokens": 22310102.0, "step": 21125 }, { "entropy": 0.045168612728011794, "epoch": 1.8650984369143941, "grad_norm": 0.828125, "learning_rate": 6.613865910725088e-05, "loss": 0.05067370414733887, "mean_token_accuracy": 0.9873959225416183, "num_tokens": 22334866.0, "step": 21150 }, { "entropy": 0.05355773056653561, "epoch": 1.8673030710553582, "grad_norm": 1.6875, "learning_rate": 6.591480867391846e-05, "loss": 0.07002598285675049, "mean_token_accuracy": 0.9833967351913452, "num_tokens": 22361930.0, "step": 21175 }, { "entropy": 0.04413744929035602, "epoch": 1.8695077051963227, "grad_norm": 1.40625, "learning_rate": 6.569115132345147e-05, "loss": 0.0611871862411499, "mean_token_accuracy": 0.9877374297380448, "num_tokens": 22388635.0, "step": 21200 }, { "entropy": 0.043995544389399587, "epoch": 1.871712339337287, "grad_norm": 2.40625, "learning_rate": 6.546768832280488e-05, "loss": 0.06620774745941162, "mean_token_accuracy": 0.9850053295493126, "num_tokens": 22415102.0, "step": 21225 }, { "entropy": 0.056586300741619196, "epoch": 1.8739169734782513, "grad_norm": 1.609375, "learning_rate": 6.524442093783278e-05, "loss": 0.08194119453430176, "mean_token_accuracy": 0.9824917709827423, "num_tokens": 22441198.0, "step": 21250 }, { "entropy": 0.05184478218172444, "epoch": 1.8761216076192155, "grad_norm": 0.83203125, "learning_rate": 6.502135043328099e-05, "loss": 0.0703523302078247, "mean_token_accuracy": 0.9819017976522446, "num_tokens": 22467371.0, "step": 21275 }, { "entropy": 0.05208466888478142, "epoch": 1.8783262417601798, "grad_norm": 0.515625, "learning_rate": 6.479847807278016e-05, "loss": 0.07266618251800537, "mean_token_accuracy": 0.9821174338459968, "num_tokens": 22493871.0, "step": 21300 }, { "entropy": 0.046822893920980276, "epoch": 1.8805308759011443, "grad_norm": 1.6015625, "learning_rate": 6.457580511883851e-05, "loss": 0.06494226932525635, "mean_token_accuracy": 0.981035427749157, "num_tokens": 22520520.0, "step": 21325 }, { "entropy": 0.04288673719856888, "epoch": 1.8827355100421084, "grad_norm": 0.859375, "learning_rate": 6.435333283283475e-05, "loss": 0.05355540275573731, "mean_token_accuracy": 0.9868204814195632, "num_tokens": 22547233.0, "step": 21350 }, { "entropy": 0.04752451835520333, "epoch": 1.8849401441830729, "grad_norm": 1.015625, "learning_rate": 6.413106247501069e-05, "loss": 0.06815120220184326, "mean_token_accuracy": 0.9843712577223778, "num_tokens": 22572916.0, "step": 21375 }, { "entropy": 0.04865189905962325, "epoch": 1.8871447783240372, "grad_norm": 1.3125, "learning_rate": 6.390899530446443e-05, "loss": 0.07241939544677735, "mean_token_accuracy": 0.9836265042424202, "num_tokens": 22598245.0, "step": 21400 }, { "entropy": 0.05860245328833116, "epoch": 1.8893494124650014, "grad_norm": 2.375, "learning_rate": 6.368713257914295e-05, "loss": 0.07383595943450928, "mean_token_accuracy": 0.9842527809739113, "num_tokens": 22624338.0, "step": 21425 }, { "entropy": 0.05662819688994205, "epoch": 1.8915540466059657, "grad_norm": 2.0625, "learning_rate": 6.346547555583526e-05, "loss": 0.07304172039031982, "mean_token_accuracy": 0.98199245095253, "num_tokens": 22650941.0, "step": 21450 }, { "entropy": 0.04486946415730927, "epoch": 1.89375868074693, "grad_norm": 0.62109375, "learning_rate": 6.324402549016493e-05, "loss": 0.05117866992950439, "mean_token_accuracy": 0.9867324560880661, "num_tokens": 22676550.0, "step": 21475 }, { "entropy": 0.058360014182981104, "epoch": 1.8959633148878945, "grad_norm": 0.88671875, "learning_rate": 6.302278363658337e-05, "loss": 0.08039702415466309, "mean_token_accuracy": 0.9807369062304496, "num_tokens": 22704290.0, "step": 21500 }, { "entropy": 0.04482670632161898, "epoch": 1.8981679490288585, "grad_norm": 1.53125, "learning_rate": 6.280175124836234e-05, "loss": 0.047524452209472656, "mean_token_accuracy": 0.9890332189202309, "num_tokens": 22729939.0, "step": 21525 }, { "entropy": 0.04213360240697511, "epoch": 1.900372583169823, "grad_norm": 0.7265625, "learning_rate": 6.258092957758727e-05, "loss": 0.04635190963745117, "mean_token_accuracy": 0.987784284055233, "num_tokens": 22755060.0, "step": 21550 }, { "entropy": 0.04406625685893232, "epoch": 1.9025772173107873, "grad_norm": 1.65625, "learning_rate": 6.236031987514968e-05, "loss": 0.06420273780822754, "mean_token_accuracy": 0.979720167517662, "num_tokens": 22780955.0, "step": 21575 }, { "entropy": 0.048322958536518856, "epoch": 1.9047818514517516, "grad_norm": 1.28125, "learning_rate": 6.213992339074052e-05, "loss": 0.07283812046051025, "mean_token_accuracy": 0.9831582528352737, "num_tokens": 22807721.0, "step": 21600 }, { "entropy": 0.04930130613342044, "epoch": 1.9069864855927159, "grad_norm": 0.3125, "learning_rate": 6.191974137284286e-05, "loss": 0.0650008487701416, "mean_token_accuracy": 0.987842877805233, "num_tokens": 22833581.0, "step": 21625 }, { "entropy": 0.056969212190742836, "epoch": 1.9091911197336802, "grad_norm": 1.421875, "learning_rate": 6.169977506872495e-05, "loss": 0.07546248912811279, "mean_token_accuracy": 0.9796876290440559, "num_tokens": 22860516.0, "step": 21650 }, { "entropy": 0.049281568287333356, "epoch": 1.9113957538746447, "grad_norm": 1.1875, "learning_rate": 6.148002572443293e-05, "loss": 0.060806169509887695, "mean_token_accuracy": 0.9853131911158561, "num_tokens": 22887480.0, "step": 21675 }, { "entropy": 0.04275947823371098, "epoch": 1.9136003880156087, "grad_norm": 0.90234375, "learning_rate": 6.126049458478406e-05, "loss": 0.06027640342712402, "mean_token_accuracy": 0.9874812626838684, "num_tokens": 22913669.0, "step": 21700 }, { "entropy": 0.04580010384270281, "epoch": 1.9158050221565732, "grad_norm": 2.921875, "learning_rate": 6.104118289335954e-05, "loss": 0.07557197093963623, "mean_token_accuracy": 0.9834654727578163, "num_tokens": 22939277.0, "step": 21725 }, { "entropy": 0.053979095179820434, "epoch": 1.9180096562975373, "grad_norm": 2.546875, "learning_rate": 6.082209189249737e-05, "loss": 0.08138853073120117, "mean_token_accuracy": 0.9819204857945443, "num_tokens": 22966377.0, "step": 21750 }, { "entropy": 0.05803485995536903, "epoch": 1.9202142904385018, "grad_norm": 1.015625, "learning_rate": 6.060322282328541e-05, "loss": 0.07845424175262451, "mean_token_accuracy": 0.9816239723563194, "num_tokens": 22993866.0, "step": 21775 }, { "entropy": 0.047958312368718906, "epoch": 1.922418924579466, "grad_norm": 2.453125, "learning_rate": 6.038457692555439e-05, "loss": 0.059175658226013186, "mean_token_accuracy": 0.9811293777823448, "num_tokens": 23019960.0, "step": 21800 }, { "entropy": 0.05747593588210293, "epoch": 1.9246235587204303, "grad_norm": 2.125, "learning_rate": 6.0166155437870874e-05, "loss": 0.07036843299865722, "mean_token_accuracy": 0.9825942468643188, "num_tokens": 23047052.0, "step": 21825 }, { "entropy": 0.04418198180996114, "epoch": 1.9268281928613946, "grad_norm": 0.5078125, "learning_rate": 5.994795959753011e-05, "loss": 0.0574480676651001, "mean_token_accuracy": 0.9849589946866035, "num_tokens": 23071692.0, "step": 21850 }, { "entropy": 0.04961350689853134, "epoch": 1.929032827002359, "grad_norm": 1.203125, "learning_rate": 5.9729990640549135e-05, "loss": 0.06114984512329102, "mean_token_accuracy": 0.9833905136585236, "num_tokens": 23097756.0, "step": 21875 }, { "entropy": 0.046675025945150994, "epoch": 1.9312374611433234, "grad_norm": 0.375, "learning_rate": 5.95122498016598e-05, "loss": 0.06442777156829833, "mean_token_accuracy": 0.9827769809961319, "num_tokens": 23124268.0, "step": 21900 }, { "entropy": 0.05056103619572241, "epoch": 1.9334420952842875, "grad_norm": 0.8125, "learning_rate": 5.9294738314301743e-05, "loss": 0.06868845462799072, "mean_token_accuracy": 0.9845385247468948, "num_tokens": 23151215.0, "step": 21925 }, { "entropy": 0.048866855730157116, "epoch": 1.935646729425252, "grad_norm": 1.6171875, "learning_rate": 5.907745741061534e-05, "loss": 0.057186398506164554, "mean_token_accuracy": 0.9831150618195533, "num_tokens": 23177437.0, "step": 21950 }, { "entropy": 0.04560523602282046, "epoch": 1.9378513635662162, "grad_norm": 0.5703125, "learning_rate": 5.8860408321434846e-05, "loss": 0.061235876083374025, "mean_token_accuracy": 0.9851277622580529, "num_tokens": 23202778.0, "step": 21975 }, { "entropy": 0.04789327524813416, "epoch": 1.9400559977071805, "grad_norm": 1.984375, "learning_rate": 5.864359227628122e-05, "loss": 0.07066461086273193, "mean_token_accuracy": 0.9854995140433311, "num_tokens": 23228354.0, "step": 22000 }, { "epoch": 1.9400559977071805, "eval_entropy": 0.020465548207560823, "eval_loss": 0.024450432509183884, "eval_mean_token_accuracy": 0.9920910437376308, "eval_num_tokens": 23228354.0, "eval_runtime": 257.3468, "eval_samples_per_second": 15.267, "eval_steps_per_second": 3.82, "step": 22000 }, { "entropy": 0.03955814203254704, "epoch": 1.9422606318481448, "grad_norm": 0.9453125, "learning_rate": 5.842701050335543e-05, "loss": 0.060781092643737794, "mean_token_accuracy": 0.9853532475233078, "num_tokens": 23254352.0, "step": 22025 }, { "entropy": 0.0578464501263079, "epoch": 1.944465265989109, "grad_norm": 0.380859375, "learning_rate": 5.8210664229531295e-05, "loss": 0.08736164093017579, "mean_token_accuracy": 0.981940735578537, "num_tokens": 23281917.0, "step": 22050 }, { "entropy": 0.05473690754020936, "epoch": 1.9466699001300736, "grad_norm": 1.4296875, "learning_rate": 5.799455468034867e-05, "loss": 0.08340877532958985, "mean_token_accuracy": 0.9806632310152054, "num_tokens": 23308935.0, "step": 22075 }, { "entropy": 0.06383058197628998, "epoch": 1.9488745342710376, "grad_norm": 0.306640625, "learning_rate": 5.777868308000629e-05, "loss": 0.0989750099182129, "mean_token_accuracy": 0.9789142432808876, "num_tokens": 23336067.0, "step": 22100 }, { "entropy": 0.04984018235598341, "epoch": 1.9510791684120021, "grad_norm": 1.0546875, "learning_rate": 5.7563050651355144e-05, "loss": 0.06965320110321045, "mean_token_accuracy": 0.9844454318284989, "num_tokens": 23362163.0, "step": 22125 }, { "entropy": 0.05287686903800932, "epoch": 1.9532838025529662, "grad_norm": 1.1328125, "learning_rate": 5.73476586158912e-05, "loss": 0.05809790134429932, "mean_token_accuracy": 0.9823646634817124, "num_tokens": 23387801.0, "step": 22150 }, { "entropy": 0.04915350146053243, "epoch": 1.9554884366939307, "grad_norm": 1.03125, "learning_rate": 5.713250819374888e-05, "loss": 0.06481000423431396, "mean_token_accuracy": 0.9831967288255692, "num_tokens": 23413992.0, "step": 22175 }, { "entropy": 0.04388614729527035, "epoch": 1.957693070834895, "grad_norm": 0.79296875, "learning_rate": 5.691760060369372e-05, "loss": 0.057064995765686036, "mean_token_accuracy": 0.9855844187736511, "num_tokens": 23440892.0, "step": 22200 }, { "entropy": 0.04378307052866148, "epoch": 1.9598977049758592, "grad_norm": 2.71875, "learning_rate": 5.6702937063115844e-05, "loss": 0.06034691333770752, "mean_token_accuracy": 0.9812196576595307, "num_tokens": 23468946.0, "step": 22225 }, { "entropy": 0.042295570675632915, "epoch": 1.9621023391168235, "grad_norm": 0.828125, "learning_rate": 5.6488518788022834e-05, "loss": 0.056632590293884275, "mean_token_accuracy": 0.9866640722751617, "num_tokens": 23494432.0, "step": 22250 }, { "entropy": 0.05475355692382436, "epoch": 1.9643069732577878, "grad_norm": 2.515625, "learning_rate": 5.627434699303296e-05, "loss": 0.08935759544372558, "mean_token_accuracy": 0.9836769181489945, "num_tokens": 23521590.0, "step": 22275 }, { "entropy": 0.04196426347873057, "epoch": 1.9665116073987523, "grad_norm": 0.251953125, "learning_rate": 5.606042289136816e-05, "loss": 0.05499490737915039, "mean_token_accuracy": 0.9883558192849159, "num_tokens": 23547518.0, "step": 22300 }, { "entropy": 0.049566179718676724, "epoch": 1.9687162415397164, "grad_norm": 1.4296875, "learning_rate": 5.584674769484735e-05, "loss": 0.06642638683319092, "mean_token_accuracy": 0.9826064822077751, "num_tokens": 23573365.0, "step": 22325 }, { "entropy": 0.046303658723190894, "epoch": 1.9709208756806809, "grad_norm": 1.765625, "learning_rate": 5.563332261387946e-05, "loss": 0.0545860767364502, "mean_token_accuracy": 0.9867219117283821, "num_tokens": 23600079.0, "step": 22350 }, { "entropy": 0.05276435214473167, "epoch": 1.9731255098216451, "grad_norm": 2.15625, "learning_rate": 5.542014885745654e-05, "loss": 0.07084882259368896, "mean_token_accuracy": 0.9833718439936638, "num_tokens": 23627130.0, "step": 22375 }, { "entropy": 0.0445670667073864, "epoch": 1.9753301439626094, "grad_norm": 1.640625, "learning_rate": 5.520722763314694e-05, "loss": 0.06202894687652588, "mean_token_accuracy": 0.9840847709774971, "num_tokens": 23653384.0, "step": 22400 }, { "entropy": 0.0423390647029737, "epoch": 1.9775347781035737, "grad_norm": 0.380859375, "learning_rate": 5.4994560147088594e-05, "loss": 0.06795649528503418, "mean_token_accuracy": 0.9833470878005027, "num_tokens": 23679568.0, "step": 22425 }, { "entropy": 0.05557289769756608, "epoch": 1.979739412244538, "grad_norm": 1.6171875, "learning_rate": 5.4782147603981993e-05, "loss": 0.07008291244506835, "mean_token_accuracy": 0.9816686421632767, "num_tokens": 23706863.0, "step": 22450 }, { "entropy": 0.058572328451264186, "epoch": 1.9819440463855025, "grad_norm": 0.58203125, "learning_rate": 5.4569991207083506e-05, "loss": 0.0892257308959961, "mean_token_accuracy": 0.981511018872261, "num_tokens": 23734937.0, "step": 22475 }, { "entropy": 0.05591572977726173, "epoch": 1.9841486805264665, "grad_norm": 1.25, "learning_rate": 5.435809215819843e-05, "loss": 0.08323918342590332, "mean_token_accuracy": 0.981179955303669, "num_tokens": 23760727.0, "step": 22500 }, { "entropy": 0.048877669227076696, "epoch": 1.986353314667431, "grad_norm": 1.90625, "learning_rate": 5.4146451657674314e-05, "loss": 0.05748679637908936, "mean_token_accuracy": 0.986055271923542, "num_tokens": 23788078.0, "step": 22525 }, { "entropy": 0.03823746287758695, "epoch": 1.988557948808395, "grad_norm": 2.21875, "learning_rate": 5.393507090439409e-05, "loss": 0.05168859958648682, "mean_token_accuracy": 0.9883771386742591, "num_tokens": 23813096.0, "step": 22550 }, { "entropy": 0.053251380213696395, "epoch": 1.9907625829493596, "grad_norm": 1.59375, "learning_rate": 5.3723951095769376e-05, "loss": 0.0701138734817505, "mean_token_accuracy": 0.9776954674720764, "num_tokens": 23840476.0, "step": 22575 }, { "entropy": 0.04151842765553738, "epoch": 1.9929672170903239, "grad_norm": 2.453125, "learning_rate": 5.351309342773347e-05, "loss": 0.05058634757995605, "mean_token_accuracy": 0.9842834863066673, "num_tokens": 23866669.0, "step": 22600 }, { "entropy": 0.04889295605695224, "epoch": 1.9951718512312882, "grad_norm": 2.53125, "learning_rate": 5.330249909473477e-05, "loss": 0.062028875350952146, "mean_token_accuracy": 0.9804515436291694, "num_tokens": 23893348.0, "step": 22625 }, { "entropy": 0.04262204033504531, "epoch": 1.9973764853722524, "grad_norm": 0.80078125, "learning_rate": 5.309216928973e-05, "loss": 0.05334056854248047, "mean_token_accuracy": 0.9852747458219528, "num_tokens": 23920789.0, "step": 22650 }, { "entropy": 0.06502369482783252, "epoch": 1.9995811195132167, "grad_norm": 2.5625, "learning_rate": 5.28821052041774e-05, "loss": 0.10824305534362794, "mean_token_accuracy": 0.9763033568859101, "num_tokens": 23947393.0, "step": 22675 }, { "entropy": 0.04334155930002747, "epoch": 2.0017637073127714, "grad_norm": 1.4296875, "learning_rate": 5.2672308028029915e-05, "loss": 0.03947641611099243, "mean_token_accuracy": 0.9899958405229781, "num_tokens": 23973397.0, "step": 22700 }, { "entropy": 0.036623261850108974, "epoch": 2.003968341453736, "grad_norm": 0.82421875, "learning_rate": 5.246277894972862e-05, "loss": 0.0373115611076355, "mean_token_accuracy": 0.9904880458116532, "num_tokens": 23999702.0, "step": 22725 }, { "entropy": 0.03956556206168898, "epoch": 2.0061729755947, "grad_norm": 0.80859375, "learning_rate": 5.225351915619583e-05, "loss": 0.030584862232208253, "mean_token_accuracy": 0.9897949555516243, "num_tokens": 24025597.0, "step": 22750 }, { "entropy": 0.040053981658929844, "epoch": 2.0083776097356645, "grad_norm": 1.1953125, "learning_rate": 5.20445298328285e-05, "loss": 0.04506603240966797, "mean_token_accuracy": 0.9884122493863106, "num_tokens": 24051938.0, "step": 22775 }, { "entropy": 0.03691858461916127, "epoch": 2.0105822438766285, "grad_norm": 0.71484375, "learning_rate": 5.183581216349134e-05, "loss": 0.030272905826568604, "mean_token_accuracy": 0.9916832426190376, "num_tokens": 24077647.0, "step": 22800 }, { "entropy": 0.04003718716594449, "epoch": 2.012786878017593, "grad_norm": 0.5546875, "learning_rate": 5.162736733051035e-05, "loss": 0.039083366394042966, "mean_token_accuracy": 0.9893439787626267, "num_tokens": 24104616.0, "step": 22825 }, { "entropy": 0.03008637696853839, "epoch": 2.014991512158557, "grad_norm": 3.09375, "learning_rate": 5.1419196514665845e-05, "loss": 0.028565278053283693, "mean_token_accuracy": 0.9929461520910263, "num_tokens": 24131459.0, "step": 22850 }, { "entropy": 0.037699917218196786, "epoch": 2.0171961462995216, "grad_norm": 0.984375, "learning_rate": 5.121130089518609e-05, "loss": 0.03901580572128296, "mean_token_accuracy": 0.990917377769947, "num_tokens": 24159204.0, "step": 22875 }, { "entropy": 0.032397389931247744, "epoch": 2.019400780440486, "grad_norm": 0.2236328125, "learning_rate": 5.100368164974023e-05, "loss": 0.03438486814498901, "mean_token_accuracy": 0.991854096353054, "num_tokens": 24186110.0, "step": 22900 }, { "entropy": 0.03398451173638023, "epoch": 2.02160541458145, "grad_norm": 0.408203125, "learning_rate": 5.0796339954432e-05, "loss": 0.033078410625457765, "mean_token_accuracy": 0.9912483549118042, "num_tokens": 24212478.0, "step": 22925 }, { "entropy": 0.03318491613783408, "epoch": 2.0238100487224147, "grad_norm": 1.2265625, "learning_rate": 5.0589276983792835e-05, "loss": 0.029813830852508546, "mean_token_accuracy": 0.9912602853775024, "num_tokens": 24239204.0, "step": 22950 }, { "entropy": 0.03395519398247416, "epoch": 2.0260146828633787, "grad_norm": 0.26171875, "learning_rate": 5.0382493910775275e-05, "loss": 0.04194780826568603, "mean_token_accuracy": 0.9895057672262192, "num_tokens": 24265682.0, "step": 22975 }, { "entropy": 0.032896046323221524, "epoch": 2.028219317004343, "grad_norm": 2.375, "learning_rate": 5.0175991906746335e-05, "loss": 0.04431535720825195, "mean_token_accuracy": 0.9895771262049675, "num_tokens": 24292892.0, "step": 23000 }, { "epoch": 2.028219317004343, "eval_entropy": 0.01695111796384467, "eval_loss": 0.024288112297654152, "eval_mean_token_accuracy": 0.9923011344665186, "eval_num_tokens": 24292892.0, "eval_runtime": 267.7927, "eval_samples_per_second": 14.672, "eval_steps_per_second": 3.671, "step": 23000 }, { "entropy": 0.030041932204985643, "epoch": 2.0304239511453073, "grad_norm": 0.39453125, "learning_rate": 4.996977214148083e-05, "loss": 0.04025341033935547, "mean_token_accuracy": 0.9895694407820702, "num_tokens": 24319322.0, "step": 23025 }, { "entropy": 0.03509666084595665, "epoch": 2.0326285852862718, "grad_norm": 3.109375, "learning_rate": 4.976383578315473e-05, "loss": 0.04548058986663819, "mean_token_accuracy": 0.9889927119016647, "num_tokens": 24346498.0, "step": 23050 }, { "entropy": 0.026039416307394276, "epoch": 2.0348332194272363, "grad_norm": 1.0546875, "learning_rate": 4.955818399833868e-05, "loss": 0.026256818771362305, "mean_token_accuracy": 0.992972640991211, "num_tokens": 24370667.0, "step": 23075 }, { "entropy": 0.028871810426353478, "epoch": 2.0370378535682003, "grad_norm": 0.3984375, "learning_rate": 4.935281795199128e-05, "loss": 0.034981093406677245, "mean_token_accuracy": 0.9913325354456901, "num_tokens": 24396592.0, "step": 23100 }, { "entropy": 0.028396220860795438, "epoch": 2.039242487709165, "grad_norm": 0.306640625, "learning_rate": 4.914773880745241e-05, "loss": 0.03216857671737671, "mean_token_accuracy": 0.9912496462464333, "num_tokens": 24423211.0, "step": 23125 }, { "entropy": 0.028173443265914103, "epoch": 2.041447121850129, "grad_norm": 1.21875, "learning_rate": 4.894294772643684e-05, "loss": 0.032784721851348876, "mean_token_accuracy": 0.9923763892054558, "num_tokens": 24449053.0, "step": 23150 }, { "entropy": 0.0323675285151694, "epoch": 2.0436517559910934, "grad_norm": 0.73828125, "learning_rate": 4.87384458690275e-05, "loss": 0.03548475980758667, "mean_token_accuracy": 0.9910583171248436, "num_tokens": 24476450.0, "step": 23175 }, { "entropy": 0.029555844330679974, "epoch": 2.0458563901320574, "grad_norm": 0.9609375, "learning_rate": 4.8534234393669e-05, "loss": 0.03878538608551026, "mean_token_accuracy": 0.9860998558998108, "num_tokens": 24502630.0, "step": 23200 }, { "entropy": 0.025517641233018366, "epoch": 2.048061024273022, "grad_norm": 1.890625, "learning_rate": 4.8330314457160885e-05, "loss": 0.029563398361206056, "mean_token_accuracy": 0.9919531518220901, "num_tokens": 24528314.0, "step": 23225 }, { "entropy": 0.02861691084319318, "epoch": 2.050265658413986, "grad_norm": 1.265625, "learning_rate": 4.81266872146514e-05, "loss": 0.03183458566665649, "mean_token_accuracy": 0.992757821381092, "num_tokens": 24555267.0, "step": 23250 }, { "entropy": 0.02771842215915967, "epoch": 2.0524702925549505, "grad_norm": 1.078125, "learning_rate": 4.7923353819630565e-05, "loss": 0.030379624366760255, "mean_token_accuracy": 0.9919179257750511, "num_tokens": 24581529.0, "step": 23275 }, { "entropy": 0.03345744485504838, "epoch": 2.054674926695915, "grad_norm": 0.099609375, "learning_rate": 4.7720315423924024e-05, "loss": 0.0444830846786499, "mean_token_accuracy": 0.9900724491477013, "num_tokens": 24608673.0, "step": 23300 }, { "entropy": 0.029868014961575682, "epoch": 2.056879560836879, "grad_norm": 1.0546875, "learning_rate": 4.751757317768618e-05, "loss": 0.04446365356445312, "mean_token_accuracy": 0.989599525630474, "num_tokens": 24634789.0, "step": 23325 }, { "entropy": 0.031795623505458934, "epoch": 2.0590841949778436, "grad_norm": 1.40625, "learning_rate": 4.7315128229393944e-05, "loss": 0.037621107101440426, "mean_token_accuracy": 0.9905502396821976, "num_tokens": 24660928.0, "step": 23350 }, { "entropy": 0.03209367050054425, "epoch": 2.0612888291188076, "grad_norm": 0.462890625, "learning_rate": 4.7112981725840065e-05, "loss": 0.031616337299346924, "mean_token_accuracy": 0.9918192434310913, "num_tokens": 24686594.0, "step": 23375 }, { "entropy": 0.035774395614826066, "epoch": 2.063493463259772, "grad_norm": 0.5078125, "learning_rate": 4.6911134812126745e-05, "loss": 0.04031749248504639, "mean_token_accuracy": 0.9876101958751679, "num_tokens": 24713766.0, "step": 23400 }, { "entropy": 0.024187013619557548, "epoch": 2.065698097400736, "grad_norm": 0.271484375, "learning_rate": 4.6709588631658975e-05, "loss": 0.029099743366241455, "mean_token_accuracy": 0.992880313694477, "num_tokens": 24739687.0, "step": 23425 }, { "entropy": 0.02847926665253908, "epoch": 2.0679027315417007, "grad_norm": 0.703125, "learning_rate": 4.650834432613829e-05, "loss": 0.03781245231628418, "mean_token_accuracy": 0.991992236673832, "num_tokens": 24765307.0, "step": 23450 }, { "entropy": 0.025070481478396685, "epoch": 2.0701073656826647, "grad_norm": 0.859375, "learning_rate": 4.63074030355562e-05, "loss": 0.02944427490234375, "mean_token_accuracy": 0.9928121709823609, "num_tokens": 24790905.0, "step": 23475 }, { "entropy": 0.03197751844145387, "epoch": 2.0723119998236292, "grad_norm": 1.015625, "learning_rate": 4.610676589818763e-05, "loss": 0.031009118556976318, "mean_token_accuracy": 0.9915920281410218, "num_tokens": 24816043.0, "step": 23500 }, { "entropy": 0.02905171564543707, "epoch": 2.0745166339645937, "grad_norm": 2.1875, "learning_rate": 4.590643405058458e-05, "loss": 0.031951904296875, "mean_token_accuracy": 0.9911505779623986, "num_tokens": 24842440.0, "step": 23525 }, { "entropy": 0.02454348152547027, "epoch": 2.076721268105558, "grad_norm": 0.55078125, "learning_rate": 4.570640862756973e-05, "loss": 0.029993116855621338, "mean_token_accuracy": 0.9923299300670624, "num_tokens": 24868553.0, "step": 23550 }, { "entropy": 0.027761456568005088, "epoch": 2.0789259022465223, "grad_norm": 0.82421875, "learning_rate": 4.550669076222992e-05, "loss": 0.03329916954040527, "mean_token_accuracy": 0.9906358134746551, "num_tokens": 24894867.0, "step": 23575 }, { "entropy": 0.03804819417771796, "epoch": 2.0811305363874864, "grad_norm": 1.9453125, "learning_rate": 4.530728158590982e-05, "loss": 0.04649081707000732, "mean_token_accuracy": 0.9858838346600532, "num_tokens": 24922116.0, "step": 23600 }, { "entropy": 0.029030120181087114, "epoch": 2.083335170528451, "grad_norm": 0.455078125, "learning_rate": 4.510818222820533e-05, "loss": 0.03388277530670166, "mean_token_accuracy": 0.992407936155796, "num_tokens": 24948516.0, "step": 23625 }, { "entropy": 0.02953952599247714, "epoch": 2.085539804669415, "grad_norm": 0.734375, "learning_rate": 4.490939381695743e-05, "loss": 0.03897066593170166, "mean_token_accuracy": 0.990988989174366, "num_tokens": 24974732.0, "step": 23650 }, { "entropy": 0.030306367558223427, "epoch": 2.0877444388103794, "grad_norm": 0.6875, "learning_rate": 4.471091747824566e-05, "loss": 0.03767595291137695, "mean_token_accuracy": 0.9907210981845855, "num_tokens": 25001192.0, "step": 23675 }, { "entropy": 0.03219981058784469, "epoch": 2.089949072951344, "grad_norm": 0.474609375, "learning_rate": 4.451275433638176e-05, "loss": 0.03162062644958496, "mean_token_accuracy": 0.991564072072506, "num_tokens": 25026352.0, "step": 23700 }, { "entropy": 0.02597928559704087, "epoch": 2.092153707092308, "grad_norm": 0.259765625, "learning_rate": 4.431490551390326e-05, "loss": 0.03252000331878662, "mean_token_accuracy": 0.9911320436000824, "num_tokens": 25052073.0, "step": 23725 }, { "entropy": 0.032203564375049606, "epoch": 2.0943583412332725, "grad_norm": 0.55078125, "learning_rate": 4.4117372131567136e-05, "loss": 0.04490713119506836, "mean_token_accuracy": 0.9891028618812561, "num_tokens": 25077135.0, "step": 23750 }, { "entropy": 0.028571351964310453, "epoch": 2.0965629753742365, "grad_norm": 0.265625, "learning_rate": 4.392015530834358e-05, "loss": 0.02833454132080078, "mean_token_accuracy": 0.992771886587143, "num_tokens": 25104636.0, "step": 23775 }, { "entropy": 0.03307085748732788, "epoch": 2.098767609515201, "grad_norm": 1.3046875, "learning_rate": 4.372325616140954e-05, "loss": 0.0332148814201355, "mean_token_accuracy": 0.9900698167085648, "num_tokens": 25132472.0, "step": 23800 }, { "entropy": 0.030112269805176766, "epoch": 2.100972243656165, "grad_norm": 0.5390625, "learning_rate": 4.352667580614236e-05, "loss": 0.03317270040512085, "mean_token_accuracy": 0.9916463688015937, "num_tokens": 25158834.0, "step": 23825 }, { "entropy": 0.031288266634292086, "epoch": 2.1031768777971296, "grad_norm": 4.5, "learning_rate": 4.3330415356113565e-05, "loss": 0.03746793270111084, "mean_token_accuracy": 0.9905204650759697, "num_tokens": 25186663.0, "step": 23850 }, { "entropy": 0.030189171290840023, "epoch": 2.105381511938094, "grad_norm": 0.373046875, "learning_rate": 4.313447592308251e-05, "loss": 0.03885282039642334, "mean_token_accuracy": 0.9899864155054092, "num_tokens": 25213550.0, "step": 23875 }, { "entropy": 0.026368075483023858, "epoch": 2.107586146079058, "grad_norm": 0.5625, "learning_rate": 4.293885861699011e-05, "loss": 0.026547930240631103, "mean_token_accuracy": 0.9932371464371681, "num_tokens": 25238404.0, "step": 23900 }, { "entropy": 0.03681902970789452, "epoch": 2.1097907802200226, "grad_norm": 0.81640625, "learning_rate": 4.2743564545952406e-05, "loss": 0.04580854892730713, "mean_token_accuracy": 0.9896917739510536, "num_tokens": 25265556.0, "step": 23925 }, { "entropy": 0.026895671052625404, "epoch": 2.1119954143609867, "grad_norm": 0.16015625, "learning_rate": 4.2548594816254573e-05, "loss": 0.030972492694854737, "mean_token_accuracy": 0.9919655025005341, "num_tokens": 25292155.0, "step": 23950 }, { "entropy": 0.0291110172522167, "epoch": 2.114200048501951, "grad_norm": 0.361328125, "learning_rate": 4.235395053234431e-05, "loss": 0.03769398212432861, "mean_token_accuracy": 0.9900598734617233, "num_tokens": 25318586.0, "step": 23975 }, { "entropy": 0.023451846447514982, "epoch": 2.1164046826429153, "grad_norm": 1.1328125, "learning_rate": 4.21596327968259e-05, "loss": 0.0197164249420166, "mean_token_accuracy": 0.9938502493500709, "num_tokens": 25343799.0, "step": 24000 }, { "epoch": 2.1164046826429153, "eval_entropy": 0.01599665907487422, "eval_loss": 0.02462713047862053, "eval_mean_token_accuracy": 0.9924976906965611, "eval_num_tokens": 25343799.0, "eval_runtime": 235.5413, "eval_samples_per_second": 16.681, "eval_steps_per_second": 4.173, "step": 24000 }, { "entropy": 0.03335717603546073, "epoch": 2.1186093167838798, "grad_norm": 4.46875, "learning_rate": 4.196564271045379e-05, "loss": 0.0459407377243042, "mean_token_accuracy": 0.9879160994291305, "num_tokens": 25371576.0, "step": 24025 }, { "entropy": 0.028599485470440413, "epoch": 2.120813950924844, "grad_norm": 2.578125, "learning_rate": 4.177198137212629e-05, "loss": 0.03392246723175049, "mean_token_accuracy": 0.9923342031240463, "num_tokens": 25398046.0, "step": 24050 }, { "entropy": 0.030389589803635316, "epoch": 2.1230185850658083, "grad_norm": 0.95703125, "learning_rate": 4.157864987887957e-05, "loss": 0.032247264385223386, "mean_token_accuracy": 0.992058128118515, "num_tokens": 25424355.0, "step": 24075 }, { "entropy": 0.0266408162328662, "epoch": 2.125223219206773, "grad_norm": 1.53125, "learning_rate": 4.138564932588126e-05, "loss": 0.0338702917098999, "mean_token_accuracy": 0.9901499152183533, "num_tokens": 25450854.0, "step": 24100 }, { "entropy": 0.02991066069038425, "epoch": 2.127427853347737, "grad_norm": 0.58203125, "learning_rate": 4.1192980806424374e-05, "loss": 0.036460573673248294, "mean_token_accuracy": 0.9879888358712197, "num_tokens": 25476457.0, "step": 24125 }, { "entropy": 0.029010038101114334, "epoch": 2.1296324874887014, "grad_norm": 0.6796875, "learning_rate": 4.100064541192092e-05, "loss": 0.030922062397003174, "mean_token_accuracy": 0.9916366341710091, "num_tokens": 25502303.0, "step": 24150 }, { "entropy": 0.03539934004222232, "epoch": 2.1318371216296654, "grad_norm": 3.765625, "learning_rate": 4.080864423189601e-05, "loss": 0.04334574222564697, "mean_token_accuracy": 0.9894258263707161, "num_tokens": 25528695.0, "step": 24175 }, { "entropy": 0.03038750326306399, "epoch": 2.13404175577063, "grad_norm": 0.259765625, "learning_rate": 4.061697835398136e-05, "loss": 0.03719171047210693, "mean_token_accuracy": 0.989592821598053, "num_tokens": 25555501.0, "step": 24200 }, { "entropy": 0.027374957603205984, "epoch": 2.136246389911594, "grad_norm": 0.515625, "learning_rate": 4.042564886390946e-05, "loss": 0.02741792917251587, "mean_token_accuracy": 0.9915632554888725, "num_tokens": 25582365.0, "step": 24225 }, { "entropy": 0.03261903412229003, "epoch": 2.1384510240525585, "grad_norm": 0.427734375, "learning_rate": 4.023465684550709e-05, "loss": 0.03499001502990723, "mean_token_accuracy": 0.9893913465738297, "num_tokens": 25608433.0, "step": 24250 }, { "entropy": 0.03213590253819348, "epoch": 2.140655658193523, "grad_norm": 1.0703125, "learning_rate": 4.00440033806895e-05, "loss": 0.050077261924743655, "mean_token_accuracy": 0.9888369315862655, "num_tokens": 25636119.0, "step": 24275 }, { "entropy": 0.03385305298801541, "epoch": 2.142860292334487, "grad_norm": 1.515625, "learning_rate": 3.985368954945404e-05, "loss": 0.04040426731109619, "mean_token_accuracy": 0.9899595540761947, "num_tokens": 25662083.0, "step": 24300 }, { "entropy": 0.028466418773750776, "epoch": 2.1450649264754515, "grad_norm": 0.71875, "learning_rate": 3.966371642987423e-05, "loss": 0.03010768175125122, "mean_token_accuracy": 0.9902914983034133, "num_tokens": 25688871.0, "step": 24325 }, { "entropy": 0.02227822571494471, "epoch": 2.1472695606164156, "grad_norm": 2.109375, "learning_rate": 3.9474085098093396e-05, "loss": 0.02533080816268921, "mean_token_accuracy": 0.9931004998087883, "num_tokens": 25715189.0, "step": 24350 }, { "entropy": 0.03268660336951143, "epoch": 2.14947419475738, "grad_norm": 0.5078125, "learning_rate": 3.928479662831885e-05, "loss": 0.046119885444641115, "mean_token_accuracy": 0.9887044957280159, "num_tokens": 25741862.0, "step": 24375 }, { "entropy": 0.02860573635873152, "epoch": 2.151678828898344, "grad_norm": 1.8203125, "learning_rate": 3.909585209281573e-05, "loss": 0.03543957471847534, "mean_token_accuracy": 0.9902654913067818, "num_tokens": 25767683.0, "step": 24400 }, { "entropy": 0.02657259469760902, "epoch": 2.1538834630393087, "grad_norm": 2.03125, "learning_rate": 3.8907252561900774e-05, "loss": 0.03090388059616089, "mean_token_accuracy": 0.9917852020263672, "num_tokens": 25793941.0, "step": 24425 }, { "entropy": 0.02819765280069987, "epoch": 2.156088097180273, "grad_norm": 1.25, "learning_rate": 3.871899910393636e-05, "loss": 0.032193429470062256, "mean_token_accuracy": 0.9921908810734749, "num_tokens": 25820598.0, "step": 24450 }, { "entropy": 0.028375080437945142, "epoch": 2.1582927313212372, "grad_norm": 0.9921875, "learning_rate": 3.853109278532456e-05, "loss": 0.034534347057342527, "mean_token_accuracy": 0.9907797083258629, "num_tokens": 25846699.0, "step": 24475 }, { "entropy": 0.033441547485308545, "epoch": 2.1604973654622017, "grad_norm": 0.85546875, "learning_rate": 3.834353467050096e-05, "loss": 0.03380261182785034, "mean_token_accuracy": 0.9893592429161072, "num_tokens": 25873526.0, "step": 24500 }, { "entropy": 0.028523832484534068, "epoch": 2.162701999603166, "grad_norm": 0.7109375, "learning_rate": 3.8156325821928694e-05, "loss": 0.03483229398727417, "mean_token_accuracy": 0.9894470557570457, "num_tokens": 25899950.0, "step": 24525 }, { "entropy": 0.02734538020009495, "epoch": 2.1649066337441303, "grad_norm": 2.328125, "learning_rate": 3.796946730009232e-05, "loss": 0.03180067300796509, "mean_token_accuracy": 0.9929764324426651, "num_tokens": 25926135.0, "step": 24550 }, { "entropy": 0.02623340411500976, "epoch": 2.1671112678850943, "grad_norm": 0.65234375, "learning_rate": 3.778296016349195e-05, "loss": 0.036533083915710446, "mean_token_accuracy": 0.9917810225486755, "num_tokens": 25951889.0, "step": 24575 }, { "entropy": 0.030956792831748316, "epoch": 2.169315902026059, "grad_norm": 0.49609375, "learning_rate": 3.759680546863724e-05, "loss": 0.033663554191589354, "mean_token_accuracy": 0.990970625281334, "num_tokens": 25977911.0, "step": 24600 }, { "entropy": 0.025135348588664782, "epoch": 2.171520536167023, "grad_norm": 0.59375, "learning_rate": 3.7411004270041336e-05, "loss": 0.02640949010848999, "mean_token_accuracy": 0.9934472215175628, "num_tokens": 26004056.0, "step": 24625 }, { "entropy": 0.026295318936572585, "epoch": 2.1737251703079874, "grad_norm": 0.11376953125, "learning_rate": 3.722555762021489e-05, "loss": 0.03466712474822998, "mean_token_accuracy": 0.992102455496788, "num_tokens": 26030865.0, "step": 24650 }, { "entropy": 0.025986581510624093, "epoch": 2.175929804448952, "grad_norm": 0.10400390625, "learning_rate": 3.7040466569660115e-05, "loss": 0.02724365234375, "mean_token_accuracy": 0.9930882236361503, "num_tokens": 26056914.0, "step": 24675 }, { "entropy": 0.03014488375159999, "epoch": 2.178134438589916, "grad_norm": 1.171875, "learning_rate": 3.685573216686494e-05, "loss": 0.03400787115097046, "mean_token_accuracy": 0.9910931748151779, "num_tokens": 26082253.0, "step": 24700 }, { "entropy": 0.04059045683388831, "epoch": 2.1803390727308805, "grad_norm": 0.6015625, "learning_rate": 3.6671355458296994e-05, "loss": 0.05361992835998535, "mean_token_accuracy": 0.9899610930681229, "num_tokens": 26109736.0, "step": 24725 }, { "entropy": 0.021792225182471158, "epoch": 2.1825437068718445, "grad_norm": 0.1630859375, "learning_rate": 3.648733748839756e-05, "loss": 0.024166872501373293, "mean_token_accuracy": 0.9929477843642235, "num_tokens": 26134972.0, "step": 24750 }, { "entropy": 0.02890670511445933, "epoch": 2.184748341012809, "grad_norm": 1.0, "learning_rate": 3.6303679299575853e-05, "loss": 0.037716662883758544, "mean_token_accuracy": 0.9917557543516159, "num_tokens": 26162082.0, "step": 24775 }, { "entropy": 0.03442299145397556, "epoch": 2.186952975153773, "grad_norm": 1.8828125, "learning_rate": 3.612038193220302e-05, "loss": 0.0406611967086792, "mean_token_accuracy": 0.9893470558524132, "num_tokens": 26189379.0, "step": 24800 }, { "entropy": 0.030212515026360052, "epoch": 2.1891576092947376, "grad_norm": 0.388671875, "learning_rate": 3.593744642460629e-05, "loss": 0.02927395820617676, "mean_token_accuracy": 0.9920516067743301, "num_tokens": 26215772.0, "step": 24825 }, { "entropy": 0.037156562910895446, "epoch": 2.1913622434357016, "grad_norm": 1.984375, "learning_rate": 3.575487381306296e-05, "loss": 0.051420702934265136, "mean_token_accuracy": 0.9874176776409149, "num_tokens": 26243037.0, "step": 24850 }, { "entropy": 0.028511690973427904, "epoch": 2.193566877576666, "grad_norm": 0.08984375, "learning_rate": 3.557266513179474e-05, "loss": 0.04477667808532715, "mean_token_accuracy": 0.9889134570956231, "num_tokens": 26269706.0, "step": 24875 }, { "entropy": 0.03325704227485403, "epoch": 2.1957715117176306, "grad_norm": 1.2734375, "learning_rate": 3.539082141296164e-05, "loss": 0.03619353771209717, "mean_token_accuracy": 0.9900521844625473, "num_tokens": 26296027.0, "step": 24900 }, { "entropy": 0.03061635433907213, "epoch": 2.1979761458585947, "grad_norm": 0.58203125, "learning_rate": 3.520934368665641e-05, "loss": 0.0380321478843689, "mean_token_accuracy": 0.9908155652880669, "num_tokens": 26323156.0, "step": 24925 }, { "entropy": 0.030784137591181208, "epoch": 2.200180779999559, "grad_norm": 1.5, "learning_rate": 3.502823298089852e-05, "loss": 0.034097914695739744, "mean_token_accuracy": 0.9906668230891228, "num_tokens": 26349731.0, "step": 24950 }, { "entropy": 0.027966852709814704, "epoch": 2.2023854141405232, "grad_norm": 1.171875, "learning_rate": 3.4847490321628284e-05, "loss": 0.03110057830810547, "mean_token_accuracy": 0.9913800299167633, "num_tokens": 26376290.0, "step": 24975 }, { "entropy": 0.03734282299010374, "epoch": 2.2045900482814877, "grad_norm": 0.271484375, "learning_rate": 3.466711673270121e-05, "loss": 0.04846214771270752, "mean_token_accuracy": 0.9890136790275573, "num_tokens": 26402644.0, "step": 25000 }, { "epoch": 2.2045900482814877, "eval_entropy": 0.015888335370261596, "eval_loss": 0.024382170289754868, "eval_mean_token_accuracy": 0.9926599772876301, "eval_num_tokens": 26402644.0, "eval_runtime": 226.5246, "eval_samples_per_second": 17.345, "eval_steps_per_second": 4.339, "step": 25000 }, { "entropy": 0.03275453881615249, "epoch": 2.206794682422452, "grad_norm": 3.71875, "learning_rate": 3.448711323588214e-05, "loss": 0.03879266977310181, "mean_token_accuracy": 0.9901682394742966, "num_tokens": 26429152.0, "step": 25025 }, { "entropy": 0.02864964444357611, "epoch": 2.2089993165634163, "grad_norm": 1.4375, "learning_rate": 3.4307480850839454e-05, "loss": 0.03968175888061523, "mean_token_accuracy": 0.9913057947158813, "num_tokens": 26455036.0, "step": 25050 }, { "entropy": 0.02369352693334804, "epoch": 2.211203950704381, "grad_norm": 0.83984375, "learning_rate": 3.4128220595139204e-05, "loss": 0.025962200164794922, "mean_token_accuracy": 0.9940554338693619, "num_tokens": 26479705.0, "step": 25075 }, { "entropy": 0.028369134187960297, "epoch": 2.213408584845345, "grad_norm": 1.2109375, "learning_rate": 3.394933348423957e-05, "loss": 0.031112048625946045, "mean_token_accuracy": 0.9922392535209655, "num_tokens": 26506194.0, "step": 25100 }, { "entropy": 0.02832837748061138, "epoch": 2.2156132189863094, "grad_norm": 0.390625, "learning_rate": 3.37708205314848e-05, "loss": 0.031320822238922116, "mean_token_accuracy": 0.9913709491491318, "num_tokens": 26532032.0, "step": 25125 }, { "entropy": 0.023645088316334294, "epoch": 2.2178178531272734, "grad_norm": 4.0, "learning_rate": 3.359268274809984e-05, "loss": 0.027528271675109864, "mean_token_accuracy": 0.9919891226291656, "num_tokens": 26557094.0, "step": 25150 }, { "entropy": 0.027313509808845993, "epoch": 2.220022487268238, "grad_norm": 0.67578125, "learning_rate": 3.341492114318424e-05, "loss": 0.030699195861816405, "mean_token_accuracy": 0.9918340718746186, "num_tokens": 26583081.0, "step": 25175 }, { "entropy": 0.02981164409811754, "epoch": 2.222227121409202, "grad_norm": 1.2265625, "learning_rate": 3.3237536723706705e-05, "loss": 0.0402683687210083, "mean_token_accuracy": 0.9895758175849915, "num_tokens": 26609902.0, "step": 25200 }, { "entropy": 0.02851083199977438, "epoch": 2.2244317555501665, "grad_norm": 1.59375, "learning_rate": 3.306053049449927e-05, "loss": 0.03584902763366699, "mean_token_accuracy": 0.9909912210702896, "num_tokens": 26636633.0, "step": 25225 }, { "entropy": 0.02590751436029677, "epoch": 2.226636389691131, "grad_norm": 1.3984375, "learning_rate": 3.2883903458251655e-05, "loss": 0.02497697114944458, "mean_token_accuracy": 0.9933020269870758, "num_tokens": 26662757.0, "step": 25250 }, { "entropy": 0.025013647765863425, "epoch": 2.228841023832095, "grad_norm": 2.140625, "learning_rate": 3.270765661550547e-05, "loss": 0.027597415447235107, "mean_token_accuracy": 0.9925726521015167, "num_tokens": 26687762.0, "step": 25275 }, { "entropy": 0.028358331051313145, "epoch": 2.2310456579730595, "grad_norm": 2.296875, "learning_rate": 3.253179096464874e-05, "loss": 0.03725262403488159, "mean_token_accuracy": 0.9898791015148163, "num_tokens": 26713382.0, "step": 25300 }, { "entropy": 0.02306008890576777, "epoch": 2.2332502921140236, "grad_norm": 0.31640625, "learning_rate": 3.235630750191008e-05, "loss": 0.024607329368591307, "mean_token_accuracy": 0.992639516890049, "num_tokens": 26738687.0, "step": 25325 }, { "entropy": 0.03018416144524963, "epoch": 2.235454926254988, "grad_norm": 1.6328125, "learning_rate": 3.2181207221353184e-05, "loss": 0.03323997020721436, "mean_token_accuracy": 0.990810919702053, "num_tokens": 26766617.0, "step": 25350 }, { "entropy": 0.03350815741945553, "epoch": 2.237659560395952, "grad_norm": 2.3125, "learning_rate": 3.200649111487102e-05, "loss": 0.046819000244140624, "mean_token_accuracy": 0.988080404996872, "num_tokens": 26793542.0, "step": 25375 }, { "entropy": 0.02414072970594134, "epoch": 2.2398641945369167, "grad_norm": 0.1884765625, "learning_rate": 3.1832160172180426e-05, "loss": 0.0301043438911438, "mean_token_accuracy": 0.9919259274005889, "num_tokens": 26819952.0, "step": 25400 }, { "entropy": 0.03012874484673375, "epoch": 2.2420688286778807, "grad_norm": 1.453125, "learning_rate": 3.165821538081637e-05, "loss": 0.03285449743270874, "mean_token_accuracy": 0.9908521872758865, "num_tokens": 26845260.0, "step": 25425 }, { "entropy": 0.031216552823862, "epoch": 2.244273462818845, "grad_norm": 1.421875, "learning_rate": 3.148465772612639e-05, "loss": 0.03792816877365112, "mean_token_accuracy": 0.9898707485198974, "num_tokens": 26872952.0, "step": 25450 }, { "entropy": 0.02386282076506177, "epoch": 2.2464780969598097, "grad_norm": 0.3359375, "learning_rate": 3.1311488191264926e-05, "loss": 0.026478643417358397, "mean_token_accuracy": 0.9940714892745018, "num_tokens": 26897729.0, "step": 25475 }, { "entropy": 0.03102828201666853, "epoch": 2.2486827311007738, "grad_norm": 1.828125, "learning_rate": 3.1138707757187925e-05, "loss": 0.04361213684082031, "mean_token_accuracy": 0.9898296114802361, "num_tokens": 26924394.0, "step": 25500 }, { "entropy": 0.025993381780681377, "epoch": 2.2508873652417383, "grad_norm": 0.349609375, "learning_rate": 3.096631740264718e-05, "loss": 0.023462820053100585, "mean_token_accuracy": 0.9935817018151283, "num_tokens": 26950337.0, "step": 25525 }, { "entropy": 0.027647298633964965, "epoch": 2.2530919993827023, "grad_norm": 1.7265625, "learning_rate": 3.079431810418473e-05, "loss": 0.03757109642028809, "mean_token_accuracy": 0.9909259453415871, "num_tokens": 26975811.0, "step": 25550 }, { "entropy": 0.028983663184444595, "epoch": 2.255296633523667, "grad_norm": 0.94140625, "learning_rate": 3.0622710836127474e-05, "loss": 0.03265504121780396, "mean_token_accuracy": 0.9918881016969681, "num_tokens": 27002457.0, "step": 25575 }, { "entropy": 0.02929145427147887, "epoch": 2.257501267664631, "grad_norm": 1.578125, "learning_rate": 3.0451496570581482e-05, "loss": 0.036674625873565674, "mean_token_accuracy": 0.9913082128763199, "num_tokens": 27029283.0, "step": 25600 }, { "entropy": 0.0296330451936592, "epoch": 2.2597059018055954, "grad_norm": 0.8359375, "learning_rate": 3.0280676277426655e-05, "loss": 0.038873662948608396, "mean_token_accuracy": 0.9899021616578102, "num_tokens": 27055528.0, "step": 25625 }, { "entropy": 0.022964343342828215, "epoch": 2.2619105359465594, "grad_norm": 0.058349609375, "learning_rate": 3.0110250924311157e-05, "loss": 0.020480175018310547, "mean_token_accuracy": 0.9950110822916031, "num_tokens": 27081049.0, "step": 25650 }, { "entropy": 0.02799141699797474, "epoch": 2.264115170087524, "grad_norm": 1.3828125, "learning_rate": 2.9940221476645815e-05, "loss": 0.031359810829162595, "mean_token_accuracy": 0.9911579310894012, "num_tokens": 27107201.0, "step": 25675 }, { "entropy": 0.02933084479354875, "epoch": 2.2663198042284884, "grad_norm": 1.8203125, "learning_rate": 2.9770588897598893e-05, "loss": 0.03271200656890869, "mean_token_accuracy": 0.9901346156001091, "num_tokens": 27134550.0, "step": 25700 }, { "entropy": 0.028671488952568325, "epoch": 2.2685244383694525, "grad_norm": 0.546875, "learning_rate": 2.9601354148090465e-05, "loss": 0.035420951843261717, "mean_token_accuracy": 0.9904685345292091, "num_tokens": 27161406.0, "step": 25725 }, { "entropy": 0.02534457077494153, "epoch": 2.270729072510417, "grad_norm": 0.7890625, "learning_rate": 2.943251818678704e-05, "loss": 0.028134129047393798, "mean_token_accuracy": 0.991405982375145, "num_tokens": 27187953.0, "step": 25750 }, { "entropy": 0.03393531416601036, "epoch": 2.272933706651381, "grad_norm": 0.451171875, "learning_rate": 2.9264081970096034e-05, "loss": 0.05243350982666015, "mean_token_accuracy": 0.9875575861334801, "num_tokens": 27215332.0, "step": 25775 }, { "entropy": 0.023412532548227317, "epoch": 2.2751383407923456, "grad_norm": 0.26171875, "learning_rate": 2.909604645216045e-05, "loss": 0.029395694732666015, "mean_token_accuracy": 0.9933974233269691, "num_tokens": 27241193.0, "step": 25800 }, { "entropy": 0.027852850720009883, "epoch": 2.27734297493331, "grad_norm": 1.09375, "learning_rate": 2.8928412584853494e-05, "loss": 0.034067320823669436, "mean_token_accuracy": 0.9859316512942314, "num_tokens": 27268202.0, "step": 25825 }, { "entropy": 0.02478776458385255, "epoch": 2.279547609074274, "grad_norm": 0.8984375, "learning_rate": 2.876118131777311e-05, "loss": 0.031112759113311766, "mean_token_accuracy": 0.9908994352817535, "num_tokens": 27293667.0, "step": 25850 }, { "entropy": 0.020712875399785845, "epoch": 2.2817522432152386, "grad_norm": 2.28125, "learning_rate": 2.859435359823659e-05, "loss": 0.02065124988555908, "mean_token_accuracy": 0.9933943581581116, "num_tokens": 27318458.0, "step": 25875 }, { "entropy": 0.024171373999743083, "epoch": 2.2839568773562027, "grad_norm": 0.703125, "learning_rate": 2.84279303712753e-05, "loss": 0.02398313283920288, "mean_token_accuracy": 0.9932537263631821, "num_tokens": 27343651.0, "step": 25900 }, { "entropy": 0.03080252897207174, "epoch": 2.286161511497167, "grad_norm": 0.6171875, "learning_rate": 2.8261912579629248e-05, "loss": 0.03876256704330444, "mean_token_accuracy": 0.9887584137916565, "num_tokens": 27371576.0, "step": 25925 }, { "entropy": 0.030703878802341933, "epoch": 2.2883661456381312, "grad_norm": 0.57421875, "learning_rate": 2.8096301163741755e-05, "loss": 0.03663143157958984, "mean_token_accuracy": 0.9901211553812027, "num_tokens": 27398292.0, "step": 25950 }, { "entropy": 0.031650141541394984, "epoch": 2.2905707797790957, "grad_norm": 1.2734375, "learning_rate": 2.7931097061754197e-05, "loss": 0.04376762390136719, "mean_token_accuracy": 0.9910077887773514, "num_tokens": 27425104.0, "step": 25975 }, { "entropy": 0.029165671150476555, "epoch": 2.29277541392006, "grad_norm": 0.5703125, "learning_rate": 2.7766301209500543e-05, "loss": 0.03497497081756592, "mean_token_accuracy": 0.9903140386939049, "num_tokens": 27451361.0, "step": 26000 }, { "epoch": 2.29277541392006, "eval_entropy": 0.015544790721468586, "eval_loss": 0.024325313046574593, "eval_mean_token_accuracy": 0.9927062318865911, "eval_num_tokens": 27451361.0, "eval_runtime": 226.2629, "eval_samples_per_second": 17.365, "eval_steps_per_second": 4.345, "step": 26000 }, { "entropy": 0.03006629309566051, "epoch": 2.2949800480610243, "grad_norm": 0.62109375, "learning_rate": 2.7601914540502172e-05, "loss": 0.03512540340423584, "mean_token_accuracy": 0.9910671037435531, "num_tokens": 27478307.0, "step": 26025 }, { "entropy": 0.023125871705269675, "epoch": 2.297184682201989, "grad_norm": 0.3828125, "learning_rate": 2.743793798596259e-05, "loss": 0.02564392566680908, "mean_token_accuracy": 0.9933660838007927, "num_tokens": 27503140.0, "step": 26050 }, { "entropy": 0.030678539160126094, "epoch": 2.299389316342953, "grad_norm": 1.953125, "learning_rate": 2.7274372474762154e-05, "loss": 0.03678859710693359, "mean_token_accuracy": 0.9891363000869751, "num_tokens": 27529219.0, "step": 26075 }, { "entropy": 0.028060704695126334, "epoch": 2.3015939504839174, "grad_norm": 2.4375, "learning_rate": 2.7111218933452654e-05, "loss": 0.027757613658905028, "mean_token_accuracy": 0.9933583897352218, "num_tokens": 27554880.0, "step": 26100 }, { "entropy": 0.03153505565591331, "epoch": 2.3037985846248814, "grad_norm": 1.6484375, "learning_rate": 2.694847828625229e-05, "loss": 0.04007414340972901, "mean_token_accuracy": 0.9893001061677933, "num_tokens": 27582143.0, "step": 26125 }, { "entropy": 0.03299531255826878, "epoch": 2.306003218765846, "grad_norm": 2.203125, "learning_rate": 2.678615145504032e-05, "loss": 0.0395844316482544, "mean_token_accuracy": 0.9888230460882187, "num_tokens": 27609956.0, "step": 26150 }, { "entropy": 0.03849572095681651, "epoch": 2.30820785290681, "grad_norm": 1.265625, "learning_rate": 2.6624239359351856e-05, "loss": 0.05094009876251221, "mean_token_accuracy": 0.9879861554503441, "num_tokens": 27637690.0, "step": 26175 }, { "entropy": 0.02834760202822508, "epoch": 2.3104124870477745, "grad_norm": 0.7890625, "learning_rate": 2.6462742916372597e-05, "loss": 0.03409828186035156, "mean_token_accuracy": 0.9913964113593101, "num_tokens": 27664105.0, "step": 26200 }, { "entropy": 0.024753890240317558, "epoch": 2.3126171211887385, "grad_norm": 3.1875, "learning_rate": 2.6301663040933777e-05, "loss": 0.022567291259765625, "mean_token_accuracy": 0.993626494705677, "num_tokens": 27690161.0, "step": 26225 }, { "entropy": 0.028063594494415155, "epoch": 2.314821755329703, "grad_norm": 0.443359375, "learning_rate": 2.6141000645506786e-05, "loss": 0.036960182189941404, "mean_token_accuracy": 0.9914696502685547, "num_tokens": 27716880.0, "step": 26250 }, { "entropy": 0.0276568841507833, "epoch": 2.3170263894706675, "grad_norm": 2.09375, "learning_rate": 2.598075664019822e-05, "loss": 0.04201683521270752, "mean_token_accuracy": 0.9902168083190918, "num_tokens": 27743023.0, "step": 26275 }, { "entropy": 0.0294130508035596, "epoch": 2.3192310236116316, "grad_norm": 1.234375, "learning_rate": 2.582093193274452e-05, "loss": 0.03933499336242676, "mean_token_accuracy": 0.9877358794212341, "num_tokens": 27769932.0, "step": 26300 }, { "entropy": 0.023152426210435804, "epoch": 2.321435657752596, "grad_norm": 0.490234375, "learning_rate": 2.566152742850697e-05, "loss": 0.02730790138244629, "mean_token_accuracy": 0.9925672444701195, "num_tokens": 27795497.0, "step": 26325 }, { "entropy": 0.03227209112326818, "epoch": 2.32364029189356, "grad_norm": 0.8984375, "learning_rate": 2.5502544030466547e-05, "loss": 0.04344738960266113, "mean_token_accuracy": 0.9895595768094063, "num_tokens": 27822418.0, "step": 26350 }, { "entropy": 0.025001196329394588, "epoch": 2.3258449260345246, "grad_norm": 1.296875, "learning_rate": 2.5343982639218778e-05, "loss": 0.02424089431762695, "mean_token_accuracy": 0.9937809437513352, "num_tokens": 27848000.0, "step": 26375 }, { "entropy": 0.03385282089677275, "epoch": 2.3280495601754887, "grad_norm": 0.283203125, "learning_rate": 2.5185844152968552e-05, "loss": 0.044082775115966796, "mean_token_accuracy": 0.9884610024094581, "num_tokens": 27872983.0, "step": 26400 }, { "entropy": 0.028122581911738963, "epoch": 2.330254194316453, "grad_norm": 0.130859375, "learning_rate": 2.502812946752523e-05, "loss": 0.033366072177886966, "mean_token_accuracy": 0.9861757379770278, "num_tokens": 27900430.0, "step": 26425 }, { "entropy": 0.03773002337271464, "epoch": 2.3324588284574173, "grad_norm": 0.62109375, "learning_rate": 2.4870839476297437e-05, "loss": 0.04225398063659668, "mean_token_accuracy": 0.988186694085598, "num_tokens": 27929118.0, "step": 26450 }, { "entropy": 0.02792194237914373, "epoch": 2.3346634625983818, "grad_norm": 1.3359375, "learning_rate": 2.4713975070287986e-05, "loss": 0.03388465404510498, "mean_token_accuracy": 0.9922302371263504, "num_tokens": 27955837.0, "step": 26475 }, { "entropy": 0.026086576611960482, "epoch": 2.3368680967393463, "grad_norm": 0.275390625, "learning_rate": 2.4557537138088872e-05, "loss": 0.03237960815429688, "mean_token_accuracy": 0.9911448901891708, "num_tokens": 27982908.0, "step": 26500 }, { "entropy": 0.0317309611546807, "epoch": 2.3390727308803103, "grad_norm": 1.6484375, "learning_rate": 2.4401526565876286e-05, "loss": 0.03967868804931641, "mean_token_accuracy": 0.9839690843224526, "num_tokens": 28010234.0, "step": 26525 }, { "entropy": 0.023329728747648916, "epoch": 2.341277365021275, "grad_norm": 0.8828125, "learning_rate": 2.4245944237405525e-05, "loss": 0.01898858666419983, "mean_token_accuracy": 0.9931737449765206, "num_tokens": 28035715.0, "step": 26550 }, { "entropy": 0.02648252934213815, "epoch": 2.343481999162239, "grad_norm": 0.4140625, "learning_rate": 2.4090791034006044e-05, "loss": 0.02758594036102295, "mean_token_accuracy": 0.9942780634760857, "num_tokens": 28061283.0, "step": 26575 }, { "entropy": 0.03423773757087474, "epoch": 2.3456866333032034, "grad_norm": 2.625, "learning_rate": 2.3936067834576324e-05, "loss": 0.045157794952392576, "mean_token_accuracy": 0.9892179015278816, "num_tokens": 28089089.0, "step": 26600 }, { "entropy": 0.03164266299670999, "epoch": 2.347891267444168, "grad_norm": 0.3984375, "learning_rate": 2.3781775515579087e-05, "loss": 0.03755315780639648, "mean_token_accuracy": 0.9894537970423698, "num_tokens": 28115789.0, "step": 26625 }, { "entropy": 0.027904597169217595, "epoch": 2.350095901585132, "grad_norm": 0.62109375, "learning_rate": 2.3627914951036212e-05, "loss": 0.03878262996673584, "mean_token_accuracy": 0.9887554702162743, "num_tokens": 28141834.0, "step": 26650 }, { "entropy": 0.026722210118896327, "epoch": 2.3523005357260964, "grad_norm": 0.431640625, "learning_rate": 2.347448701252386e-05, "loss": 0.02975456714630127, "mean_token_accuracy": 0.9917343974113464, "num_tokens": 28168832.0, "step": 26675 }, { "entropy": 0.027321821880686912, "epoch": 2.3545051698670605, "grad_norm": 0.306640625, "learning_rate": 2.3321492569167402e-05, "loss": 0.033426897525787355, "mean_token_accuracy": 0.9895308339595794, "num_tokens": 28195876.0, "step": 26700 }, { "entropy": 0.028027048043622927, "epoch": 2.356709804008025, "grad_norm": 0.49609375, "learning_rate": 2.3168932487636595e-05, "loss": 0.0321111536026001, "mean_token_accuracy": 0.9926772129535675, "num_tokens": 28222533.0, "step": 26725 }, { "entropy": 0.03335825680707785, "epoch": 2.358914438148989, "grad_norm": 2.53125, "learning_rate": 2.3016807632140735e-05, "loss": 0.03918349027633667, "mean_token_accuracy": 0.9899929386377334, "num_tokens": 28249493.0, "step": 26750 }, { "entropy": 0.02355239788603285, "epoch": 2.3611190722899535, "grad_norm": 0.337890625, "learning_rate": 2.286511886442365e-05, "loss": 0.027818148136138917, "mean_token_accuracy": 0.9934883451461792, "num_tokens": 28274936.0, "step": 26775 }, { "entropy": 0.03090159765084536, "epoch": 2.3633237064309176, "grad_norm": 0.69140625, "learning_rate": 2.271386704375881e-05, "loss": 0.0381850004196167, "mean_token_accuracy": 0.9901610732078552, "num_tokens": 28302998.0, "step": 26800 }, { "entropy": 0.030783424060173273, "epoch": 2.365528340571882, "grad_norm": 2.46875, "learning_rate": 2.2563053026944557e-05, "loss": 0.03481113433837891, "mean_token_accuracy": 0.9903145882487298, "num_tokens": 28330381.0, "step": 26825 }, { "entropy": 0.02382873230339101, "epoch": 2.3677329747128466, "grad_norm": 0.470703125, "learning_rate": 2.2412677668299197e-05, "loss": 0.022453012466430663, "mean_token_accuracy": 0.9938893175125122, "num_tokens": 28356826.0, "step": 26850 }, { "entropy": 0.030640801413319423, "epoch": 2.3699376088538107, "grad_norm": 1.296875, "learning_rate": 2.2262741819656173e-05, "loss": 0.043948798179626464, "mean_token_accuracy": 0.989762376844883, "num_tokens": 28383268.0, "step": 26875 }, { "entropy": 0.03442299180154805, "epoch": 2.372142242994775, "grad_norm": 1.484375, "learning_rate": 2.211324633035916e-05, "loss": 0.04683504581451416, "mean_token_accuracy": 0.9879697850346565, "num_tokens": 28409715.0, "step": 26900 }, { "entropy": 0.028344044167606625, "epoch": 2.374346877135739, "grad_norm": 0.55078125, "learning_rate": 2.1964192047257415e-05, "loss": 0.03574753046035767, "mean_token_accuracy": 0.9921541050076484, "num_tokens": 28436311.0, "step": 26925 }, { "entropy": 0.029762789253472876, "epoch": 2.3765515112767037, "grad_norm": 0.478515625, "learning_rate": 2.1815579814700793e-05, "loss": 0.034858622550964356, "mean_token_accuracy": 0.9897570988535881, "num_tokens": 28463719.0, "step": 26950 }, { "entropy": 0.02525413668121473, "epoch": 2.3787561454176678, "grad_norm": 2.203125, "learning_rate": 2.1667410474535134e-05, "loss": 0.029147915840148926, "mean_token_accuracy": 0.9873283988237381, "num_tokens": 28490183.0, "step": 26975 }, { "entropy": 0.024068699735253177, "epoch": 2.3809607795586323, "grad_norm": 0.765625, "learning_rate": 2.1519684866097432e-05, "loss": 0.022618768215179445, "mean_token_accuracy": 0.992685379087925, "num_tokens": 28515389.0, "step": 27000 }, { "epoch": 2.3809607795586323, "eval_entropy": 0.01530946354798135, "eval_loss": 0.024163657799363136, "eval_mean_token_accuracy": 0.9927907401765957, "eval_num_tokens": 28515389.0, "eval_runtime": 226.0753, "eval_samples_per_second": 17.379, "eval_steps_per_second": 4.348, "step": 27000 }, { "entropy": 0.0272020617283124, "epoch": 2.3831654136995963, "grad_norm": 1.046875, "learning_rate": 2.1372403826210974e-05, "loss": 0.039848828315734865, "mean_token_accuracy": 0.9914286798238754, "num_tokens": 28541578.0, "step": 27025 }, { "entropy": 0.025422237896564184, "epoch": 2.385370047840561, "grad_norm": 2.703125, "learning_rate": 2.1225568189180768e-05, "loss": 0.02730207920074463, "mean_token_accuracy": 0.9945016172528267, "num_tokens": 28567968.0, "step": 27050 }, { "entropy": 0.030367342436147738, "epoch": 2.3875746819815253, "grad_norm": 1.984375, "learning_rate": 2.1079178786788735e-05, "loss": 0.04000374317169189, "mean_token_accuracy": 0.9900528371334076, "num_tokens": 28594455.0, "step": 27075 }, { "entropy": 0.025867001799779247, "epoch": 2.3897793161224894, "grad_norm": 0.6796875, "learning_rate": 2.0933236448289006e-05, "loss": 0.027899935245513915, "mean_token_accuracy": 0.9905596524477005, "num_tokens": 28619924.0, "step": 27100 }, { "entropy": 0.026652738297161704, "epoch": 2.391983950263454, "grad_norm": 0.46875, "learning_rate": 2.0787742000403177e-05, "loss": 0.031079134941101073, "mean_token_accuracy": 0.9919494143128396, "num_tokens": 28646733.0, "step": 27125 }, { "entropy": 0.028722508352439037, "epoch": 2.394188584404418, "grad_norm": 1.4375, "learning_rate": 2.064269626731573e-05, "loss": 0.03450024604797363, "mean_token_accuracy": 0.9902220144867897, "num_tokens": 28672305.0, "step": 27150 }, { "entropy": 0.02438920122654963, "epoch": 2.3963932185453825, "grad_norm": 1.828125, "learning_rate": 2.0498100070669256e-05, "loss": 0.031473851203918456, "mean_token_accuracy": 0.9928356763720513, "num_tokens": 28699490.0, "step": 27175 }, { "entropy": 0.02671077159993729, "epoch": 2.3985978526863465, "grad_norm": 0.490234375, "learning_rate": 2.0353954229559925e-05, "loss": 0.03216356515884399, "mean_token_accuracy": 0.9926029777526856, "num_tokens": 28725754.0, "step": 27200 }, { "entropy": 0.028233824653707416, "epoch": 2.400802486827311, "grad_norm": 0.72265625, "learning_rate": 2.0210259560532652e-05, "loss": 0.03836059808731079, "mean_token_accuracy": 0.9901475051045417, "num_tokens": 28753121.0, "step": 27225 }, { "entropy": 0.025337186203068997, "epoch": 2.403007120968275, "grad_norm": 4.28125, "learning_rate": 2.0067016877576705e-05, "loss": 0.02860053539276123, "mean_token_accuracy": 0.9930732557177544, "num_tokens": 28778679.0, "step": 27250 }, { "entropy": 0.02454310272631119, "epoch": 2.4052117551092396, "grad_norm": 0.451171875, "learning_rate": 1.9924226992120922e-05, "loss": 0.02276606559753418, "mean_token_accuracy": 0.9938413736224174, "num_tokens": 28805474.0, "step": 27275 }, { "entropy": 0.03040276386695041, "epoch": 2.407416389250204, "grad_norm": 2.5, "learning_rate": 1.978189071302923e-05, "loss": 0.03786967277526856, "mean_token_accuracy": 0.9903272116184234, "num_tokens": 28832117.0, "step": 27300 }, { "entropy": 0.02637321831722147, "epoch": 2.409621023391168, "grad_norm": 0.2734375, "learning_rate": 1.9640008846595882e-05, "loss": 0.028791847229003905, "mean_token_accuracy": 0.9920576027035714, "num_tokens": 28858739.0, "step": 27325 }, { "entropy": 0.02872723880540434, "epoch": 2.4118256575321326, "grad_norm": 0.34765625, "learning_rate": 1.9498582196541182e-05, "loss": 0.02915778636932373, "mean_token_accuracy": 0.9923367646336555, "num_tokens": 28884949.0, "step": 27350 }, { "entropy": 0.026446953547056184, "epoch": 2.4140302916730967, "grad_norm": 0.52734375, "learning_rate": 1.9357611564006594e-05, "loss": 0.03899633646011352, "mean_token_accuracy": 0.9909557834267616, "num_tokens": 28911648.0, "step": 27375 }, { "entropy": 0.028178447782229343, "epoch": 2.416234925814061, "grad_norm": 1.5546875, "learning_rate": 1.9217097747550518e-05, "loss": 0.03861397743225098, "mean_token_accuracy": 0.990208261013031, "num_tokens": 28937290.0, "step": 27400 }, { "entropy": 0.02791525627584633, "epoch": 2.4184395599550257, "grad_norm": 1.65625, "learning_rate": 1.9077041543143515e-05, "loss": 0.034581294059753416, "mean_token_accuracy": 0.989730831682682, "num_tokens": 28963573.0, "step": 27425 }, { "entropy": 0.02653196087485412, "epoch": 2.4206441940959897, "grad_norm": 0.54296875, "learning_rate": 1.893744374416395e-05, "loss": 0.028083674907684326, "mean_token_accuracy": 0.9924825802445412, "num_tokens": 28990395.0, "step": 27450 }, { "entropy": 0.0226342674649095, "epoch": 2.4228488282369542, "grad_norm": 1.875, "learning_rate": 1.8798305141393468e-05, "loss": 0.03195946455001831, "mean_token_accuracy": 0.9923165252804756, "num_tokens": 29016564.0, "step": 27475 }, { "entropy": 0.02829852351467707, "epoch": 2.4250534623779183, "grad_norm": 1.28125, "learning_rate": 1.865962652301251e-05, "loss": 0.03136762857437134, "mean_token_accuracy": 0.9916561701893807, "num_tokens": 29043714.0, "step": 27500 }, { "entropy": 0.027737032295808605, "epoch": 2.427258096518883, "grad_norm": 0.70703125, "learning_rate": 1.8521408674595742e-05, "loss": 0.027605950832366943, "mean_token_accuracy": 0.9930538147687912, "num_tokens": 29070351.0, "step": 27525 }, { "entropy": 0.02750062160557718, "epoch": 2.429462730659847, "grad_norm": 0.8828125, "learning_rate": 1.8383652379107787e-05, "loss": 0.033921008110046384, "mean_token_accuracy": 0.9925847980380058, "num_tokens": 29096293.0, "step": 27550 }, { "entropy": 0.02087086517283751, "epoch": 2.4316673648008114, "grad_norm": 4.625, "learning_rate": 1.8246358416898724e-05, "loss": 0.026158723831176758, "mean_token_accuracy": 0.9937206152081489, "num_tokens": 29121703.0, "step": 27575 }, { "entropy": 0.02890322234330597, "epoch": 2.4338719989417754, "grad_norm": 0.640625, "learning_rate": 1.8109527565699536e-05, "loss": 0.037437882423400876, "mean_token_accuracy": 0.9888174629211426, "num_tokens": 29148133.0, "step": 27600 }, { "entropy": 0.02840543278405676, "epoch": 2.43607663308274, "grad_norm": 0.83984375, "learning_rate": 1.7973160600617944e-05, "loss": 0.029362483024597166, "mean_token_accuracy": 0.9919123184680939, "num_tokens": 29174862.0, "step": 27625 }, { "entropy": 0.029121692281260037, "epoch": 2.4382812672237044, "grad_norm": 2.859375, "learning_rate": 1.7837258294133764e-05, "loss": 0.033895456790924074, "mean_token_accuracy": 0.9921913406252861, "num_tokens": 29201323.0, "step": 27650 }, { "entropy": 0.026827582789373992, "epoch": 2.4404859013646685, "grad_norm": 0.77734375, "learning_rate": 1.7701821416094745e-05, "loss": 0.029248933792114257, "mean_token_accuracy": 0.9929554259777069, "num_tokens": 29227216.0, "step": 27675 }, { "entropy": 0.020511043658625567, "epoch": 2.442690535505633, "grad_norm": 1.5859375, "learning_rate": 1.7566850733712105e-05, "loss": 0.02211562156677246, "mean_token_accuracy": 0.9927102276682853, "num_tokens": 29252647.0, "step": 27700 }, { "entropy": 0.025656236830272974, "epoch": 2.444895169646597, "grad_norm": 1.015625, "learning_rate": 1.743234701155614e-05, "loss": 0.02908522605895996, "mean_token_accuracy": 0.9913544818758965, "num_tokens": 29278335.0, "step": 27725 }, { "entropy": 0.022964876653477403, "epoch": 2.4470998037875615, "grad_norm": 1.7734375, "learning_rate": 1.7298311011551995e-05, "loss": 0.02672830820083618, "mean_token_accuracy": 0.9933206418156624, "num_tokens": 29303676.0, "step": 27750 }, { "entropy": 0.03414529487632535, "epoch": 2.4493044379285256, "grad_norm": 0.88671875, "learning_rate": 1.716474349297531e-05, "loss": 0.04021317481994629, "mean_token_accuracy": 0.9896806997060775, "num_tokens": 29331995.0, "step": 27775 }, { "entropy": 0.024354764178715415, "epoch": 2.45150907206949, "grad_norm": 2.890625, "learning_rate": 1.7031645212447913e-05, "loss": 0.029198453426361085, "mean_token_accuracy": 0.9919512045383453, "num_tokens": 29357249.0, "step": 27800 }, { "entropy": 0.03601440033242397, "epoch": 2.453713706210454, "grad_norm": 4.28125, "learning_rate": 1.68990169239335e-05, "loss": 0.04773323059082031, "mean_token_accuracy": 0.9883570069074631, "num_tokens": 29384591.0, "step": 27825 }, { "entropy": 0.025379649180413254, "epoch": 2.4559183403514186, "grad_norm": 0.91015625, "learning_rate": 1.676685937873337e-05, "loss": 0.03554457426071167, "mean_token_accuracy": 0.9918042355775833, "num_tokens": 29411165.0, "step": 27850 }, { "entropy": 0.030096235661039826, "epoch": 2.458122974492383, "grad_norm": 0.51171875, "learning_rate": 1.663517332548229e-05, "loss": 0.03285525560379028, "mean_token_accuracy": 0.9914932417869567, "num_tokens": 29436990.0, "step": 27875 }, { "entropy": 0.02419358774619468, "epoch": 2.460327608633347, "grad_norm": 3.609375, "learning_rate": 1.6503959510144106e-05, "loss": 0.03251128196716309, "mean_token_accuracy": 0.9923573270440101, "num_tokens": 29462865.0, "step": 27900 }, { "entropy": 0.028468720067612596, "epoch": 2.4625322427743117, "grad_norm": 0.2021484375, "learning_rate": 1.6373218676007553e-05, "loss": 0.0337121057510376, "mean_token_accuracy": 0.9913557228446007, "num_tokens": 29489633.0, "step": 27925 }, { "entropy": 0.024808133746118982, "epoch": 2.4647368769152758, "grad_norm": 0.11767578125, "learning_rate": 1.6242951563682086e-05, "loss": 0.028612098693847655, "mean_token_accuracy": 0.9922372248768806, "num_tokens": 29514759.0, "step": 27950 }, { "entropy": 0.026218910121970113, "epoch": 2.4669415110562403, "grad_norm": 1.6875, "learning_rate": 1.611315891109367e-05, "loss": 0.02892878532409668, "mean_token_accuracy": 0.9922880592942238, "num_tokens": 29540408.0, "step": 27975 }, { "entropy": 0.028717517795703317, "epoch": 2.4691461451972048, "grad_norm": 3.234375, "learning_rate": 1.5983841453480576e-05, "loss": 0.03306084871292114, "mean_token_accuracy": 0.9898355248570442, "num_tokens": 29567167.0, "step": 28000 }, { "epoch": 2.4691461451972048, "eval_entropy": 0.015174210387057532, "eval_loss": 0.024244122207164764, "eval_mean_token_accuracy": 0.9927880360175488, "eval_num_tokens": 29567167.0, "eval_runtime": 246.3571, "eval_samples_per_second": 15.948, "eval_steps_per_second": 3.99, "step": 28000 }, { "entropy": 0.02790546601085225, "epoch": 2.471350779338169, "grad_norm": 3.484375, "learning_rate": 1.5854999923389258e-05, "loss": 0.031562213897705076, "mean_token_accuracy": 0.9908042460680008, "num_tokens": 29593800.0, "step": 28025 }, { "entropy": 0.029840830979082966, "epoch": 2.4735554134791333, "grad_norm": 1.7890625, "learning_rate": 1.5726635050670123e-05, "loss": 0.036235096454620364, "mean_token_accuracy": 0.9905793526768685, "num_tokens": 29619591.0, "step": 28050 }, { "entropy": 0.029747921650014178, "epoch": 2.4757600476200974, "grad_norm": 1.1796875, "learning_rate": 1.559874756247347e-05, "loss": 0.03456358909606934, "mean_token_accuracy": 0.9919943836331367, "num_tokens": 29646045.0, "step": 28075 }, { "entropy": 0.02587532730682142, "epoch": 2.477964681761062, "grad_norm": 0.4609375, "learning_rate": 1.5471338183245386e-05, "loss": 0.032157759666442874, "mean_token_accuracy": 0.9919455042481422, "num_tokens": 29673216.0, "step": 28100 }, { "entropy": 0.023125849339194246, "epoch": 2.480169315902026, "grad_norm": 0.06591796875, "learning_rate": 1.534440763472361e-05, "loss": 0.028566434383392333, "mean_token_accuracy": 0.9923149171471596, "num_tokens": 29698828.0, "step": 28125 }, { "entropy": 0.028684415022034955, "epoch": 2.4823739500429904, "grad_norm": 0.376953125, "learning_rate": 1.521795663593335e-05, "loss": 0.033868846893310545, "mean_token_accuracy": 0.9930822342634201, "num_tokens": 29724874.0, "step": 28150 }, { "entropy": 0.02750486832945171, "epoch": 2.4845785841839545, "grad_norm": 0.95703125, "learning_rate": 1.5091985903183415e-05, "loss": 0.03277843952178955, "mean_token_accuracy": 0.9917207375168801, "num_tokens": 29751693.0, "step": 28175 }, { "entropy": 0.02878666256321594, "epoch": 2.486783218324919, "grad_norm": 3.671875, "learning_rate": 1.496649615006197e-05, "loss": 0.03613584280014038, "mean_token_accuracy": 0.9895001071691513, "num_tokens": 29779275.0, "step": 28200 }, { "entropy": 0.023778437117980503, "epoch": 2.4889878524658835, "grad_norm": 0.158203125, "learning_rate": 1.4841488087432642e-05, "loss": 0.035824286937713626, "mean_token_accuracy": 0.991601872742176, "num_tokens": 29805152.0, "step": 28225 }, { "entropy": 0.028207669479197648, "epoch": 2.4911924866068476, "grad_norm": 1.46875, "learning_rate": 1.4716962423430314e-05, "loss": 0.03573413610458374, "mean_token_accuracy": 0.9917686119675636, "num_tokens": 29831523.0, "step": 28250 }, { "entropy": 0.027834644988688526, "epoch": 2.493397120747812, "grad_norm": 1.1640625, "learning_rate": 1.4592919863457332e-05, "loss": 0.03196176767349243, "mean_token_accuracy": 0.9919345453381538, "num_tokens": 29858086.0, "step": 28275 }, { "entropy": 0.030679253008656815, "epoch": 2.495601754888776, "grad_norm": 0.90234375, "learning_rate": 1.446936111017928e-05, "loss": 0.03689356803894043, "mean_token_accuracy": 0.9908839726448059, "num_tokens": 29883903.0, "step": 28300 }, { "entropy": 0.027692356215484323, "epoch": 2.4978063890297406, "grad_norm": 0.53515625, "learning_rate": 1.434628686352123e-05, "loss": 0.030295934677124024, "mean_token_accuracy": 0.9929940468072891, "num_tokens": 29910032.0, "step": 28325 }, { "entropy": 0.03307623949665867, "epoch": 2.5000110231707047, "grad_norm": 4.3125, "learning_rate": 1.422369782066355e-05, "loss": 0.042545747756958005, "mean_token_accuracy": 0.9901801961660385, "num_tokens": 29937722.0, "step": 28350 }, { "entropy": 0.024893675402163352, "epoch": 2.502215657311669, "grad_norm": 0.47265625, "learning_rate": 1.4101594676038165e-05, "loss": 0.03002509593963623, "mean_token_accuracy": 0.9913722136616707, "num_tokens": 29963785.0, "step": 28375 }, { "entropy": 0.02339222040751338, "epoch": 2.5044202914526332, "grad_norm": 0.703125, "learning_rate": 1.3979978121324488e-05, "loss": 0.025294029712677003, "mean_token_accuracy": 0.9939557710289955, "num_tokens": 29990305.0, "step": 28400 }, { "entropy": 0.021295039523283776, "epoch": 2.5066249255935977, "grad_norm": 1.0859375, "learning_rate": 1.3858848845445538e-05, "loss": 0.01983662486076355, "mean_token_accuracy": 0.9948306560516358, "num_tokens": 30015670.0, "step": 28425 }, { "entropy": 0.025478222227975494, "epoch": 2.5088295597345622, "grad_norm": 2.46875, "learning_rate": 1.3738207534564007e-05, "loss": 0.0268137526512146, "mean_token_accuracy": 0.9927583593130112, "num_tokens": 30041298.0, "step": 28450 }, { "entropy": 0.027219222592430016, "epoch": 2.5110341938755263, "grad_norm": 0.6328125, "learning_rate": 1.3618054872078456e-05, "loss": 0.03663578748703003, "mean_token_accuracy": 0.9898635944724083, "num_tokens": 30067356.0, "step": 28475 }, { "entropy": 0.02611872347224562, "epoch": 2.513238828016491, "grad_norm": 0.5234375, "learning_rate": 1.3498391538619405e-05, "loss": 0.029873659610748293, "mean_token_accuracy": 0.9930629214644432, "num_tokens": 30094241.0, "step": 28500 }, { "entropy": 0.029561223531491123, "epoch": 2.515443462157455, "grad_norm": 1.0078125, "learning_rate": 1.3379218212045386e-05, "loss": 0.029539554119110106, "mean_token_accuracy": 0.9937109515070915, "num_tokens": 30119806.0, "step": 28525 }, { "entropy": 0.027742599264747696, "epoch": 2.5176480962984193, "grad_norm": 2.640625, "learning_rate": 1.3260535567439215e-05, "loss": 0.027226037979125976, "mean_token_accuracy": 0.9915308326482772, "num_tokens": 30145291.0, "step": 28550 }, { "entropy": 0.029665047602356935, "epoch": 2.519852730439384, "grad_norm": 2.03125, "learning_rate": 1.3142344277104169e-05, "loss": 0.02815444231033325, "mean_token_accuracy": 0.9921803346276283, "num_tokens": 30172389.0, "step": 28575 }, { "entropy": 0.03133596465810115, "epoch": 2.522057364580348, "grad_norm": 1.1015625, "learning_rate": 1.3024645010560133e-05, "loss": 0.03243262767791748, "mean_token_accuracy": 0.9912997674942017, "num_tokens": 30200862.0, "step": 28600 }, { "entropy": 0.02291832109705865, "epoch": 2.524261998721312, "grad_norm": 1.2109375, "learning_rate": 1.290743843453982e-05, "loss": 0.028611266613006593, "mean_token_accuracy": 0.992739563882351, "num_tokens": 30225486.0, "step": 28625 }, { "entropy": 0.024033257207702264, "epoch": 2.5264666328622765, "grad_norm": 1.1171875, "learning_rate": 1.2790725212984945e-05, "loss": 0.028946173191070557, "mean_token_accuracy": 0.9921584859490394, "num_tokens": 30252052.0, "step": 28650 }, { "entropy": 0.03086577148964352, "epoch": 2.528671267003241, "grad_norm": 0.62109375, "learning_rate": 1.2674506007042542e-05, "loss": 0.04069224357604981, "mean_token_accuracy": 0.9903390213847161, "num_tokens": 30279085.0, "step": 28675 }, { "entropy": 0.02383694459356775, "epoch": 2.530875901144205, "grad_norm": 1.2890625, "learning_rate": 1.255878147506122e-05, "loss": 0.029977023601531982, "mean_token_accuracy": 0.991557080745697, "num_tokens": 30305409.0, "step": 28700 }, { "entropy": 0.027153952862045118, "epoch": 2.5330805352851695, "grad_norm": 0.69140625, "learning_rate": 1.2443552272587377e-05, "loss": 0.03152206897735596, "mean_token_accuracy": 0.9924914839863778, "num_tokens": 30332015.0, "step": 28725 }, { "entropy": 0.025579888974643836, "epoch": 2.5352851694261336, "grad_norm": 0.87109375, "learning_rate": 1.2328819052361495e-05, "loss": 0.028663196563720704, "mean_token_accuracy": 0.992575915157795, "num_tokens": 30359014.0, "step": 28750 }, { "entropy": 0.02768795941843564, "epoch": 2.537489803567098, "grad_norm": 0.1943359375, "learning_rate": 1.2214582464314428e-05, "loss": 0.02344026803970337, "mean_token_accuracy": 0.994471182525158, "num_tokens": 30385317.0, "step": 28775 }, { "entropy": 0.022483936410135356, "epoch": 2.5396944377080626, "grad_norm": 2.125, "learning_rate": 1.2100843155563845e-05, "loss": 0.02409367561340332, "mean_token_accuracy": 0.9890817698836326, "num_tokens": 30411445.0, "step": 28800 }, { "entropy": 0.0211687412471656, "epoch": 2.5418990718490266, "grad_norm": 0.4609375, "learning_rate": 1.1987601770410406e-05, "loss": 0.022674081325531007, "mean_token_accuracy": 0.9933211871981621, "num_tokens": 30436622.0, "step": 28825 }, { "entropy": 0.024043053868699645, "epoch": 2.5441037059899907, "grad_norm": 1.15625, "learning_rate": 1.1874858950334167e-05, "loss": 0.025228850841522217, "mean_token_accuracy": 0.9945827552676201, "num_tokens": 30462046.0, "step": 28850 }, { "entropy": 0.026052604026444895, "epoch": 2.546308340130955, "grad_norm": 0.62109375, "learning_rate": 1.1762615333990989e-05, "loss": 0.03171946048736572, "mean_token_accuracy": 0.9917865255475045, "num_tokens": 30488101.0, "step": 28875 }, { "entropy": 0.030083065580874972, "epoch": 2.5485129742719197, "grad_norm": 0.71875, "learning_rate": 1.1650871557208876e-05, "loss": 0.03770236015319824, "mean_token_accuracy": 0.9915402907133103, "num_tokens": 30515183.0, "step": 28900 }, { "entropy": 0.022853468903722387, "epoch": 2.5507176084128838, "grad_norm": 1.2421875, "learning_rate": 1.1539628252984403e-05, "loss": 0.02523197889328003, "mean_token_accuracy": 0.9935995003581047, "num_tokens": 30540057.0, "step": 28925 }, { "entropy": 0.029179727130540414, "epoch": 2.5529222425538483, "grad_norm": 0.337890625, "learning_rate": 1.1428886051479037e-05, "loss": 0.03404136896133423, "mean_token_accuracy": 0.9906327998638154, "num_tokens": 30566287.0, "step": 28950 }, { "entropy": 0.02455010826686703, "epoch": 2.5551268766948123, "grad_norm": 0.88671875, "learning_rate": 1.1318645580015752e-05, "loss": 0.03312621593475342, "mean_token_accuracy": 0.992313320338726, "num_tokens": 30591905.0, "step": 28975 }, { "entropy": 0.024439287170152965, "epoch": 2.557331510835777, "grad_norm": 0.91796875, "learning_rate": 1.1208907463075246e-05, "loss": 0.02883162260055542, "mean_token_accuracy": 0.9913711741566658, "num_tokens": 30617761.0, "step": 29000 }, { "epoch": 2.557331510835777, "eval_entropy": 0.01511552784521922, "eval_loss": 0.024238504469394684, "eval_mean_token_accuracy": 0.9927981172670304, "eval_num_tokens": 30617761.0, "eval_runtime": 227.0085, "eval_samples_per_second": 17.308, "eval_steps_per_second": 4.33, "step": 29000 }, { "entropy": 0.028338864904362708, "epoch": 2.5595361449767413, "grad_norm": 0.41796875, "learning_rate": 1.1099672322292621e-05, "loss": 0.04463479518890381, "mean_token_accuracy": 0.9912523052096367, "num_tokens": 30643640.0, "step": 29025 }, { "entropy": 0.027401521958090598, "epoch": 2.5617407791177054, "grad_norm": 0.97265625, "learning_rate": 1.0990940776453728e-05, "loss": 0.0312890076637268, "mean_token_accuracy": 0.9916872721910477, "num_tokens": 30669460.0, "step": 29050 }, { "entropy": 0.02932647341722259, "epoch": 2.56394541325867, "grad_norm": 2.34375, "learning_rate": 1.0882713441491653e-05, "loss": 0.03259019613265991, "mean_token_accuracy": 0.9919594395160675, "num_tokens": 30695879.0, "step": 29075 }, { "entropy": 0.02201111441543617, "epoch": 2.566150047399634, "grad_norm": 1.0, "learning_rate": 1.0774990930483354e-05, "loss": 0.03443076848983764, "mean_token_accuracy": 0.9926259750127793, "num_tokens": 30721398.0, "step": 29100 }, { "entropy": 0.031130029212763474, "epoch": 2.5683546815405984, "grad_norm": 2.703125, "learning_rate": 1.0667773853646034e-05, "loss": 0.05192684173583984, "mean_token_accuracy": 0.9869232523441315, "num_tokens": 30750109.0, "step": 29125 }, { "entropy": 0.02571553880090505, "epoch": 2.5705593156815625, "grad_norm": 0.9453125, "learning_rate": 1.0561062818333822e-05, "loss": 0.029610633850097656, "mean_token_accuracy": 0.9921117216348648, "num_tokens": 30776603.0, "step": 29150 }, { "entropy": 0.026652264393324004, "epoch": 2.572763949822527, "grad_norm": 0.71875, "learning_rate": 1.0454858429034187e-05, "loss": 0.03440707683563232, "mean_token_accuracy": 0.9905963116884231, "num_tokens": 30802740.0, "step": 29175 }, { "entropy": 0.024566626491359784, "epoch": 2.574968583963491, "grad_norm": 1.015625, "learning_rate": 1.0349161287364673e-05, "loss": 0.028398797512054444, "mean_token_accuracy": 0.992349998652935, "num_tokens": 30829022.0, "step": 29200 }, { "entropy": 0.03593797193334467, "epoch": 2.5771732181044555, "grad_norm": 0.70703125, "learning_rate": 1.0243971992069334e-05, "loss": 0.05232769966125488, "mean_token_accuracy": 0.9877561151981353, "num_tokens": 30856036.0, "step": 29225 }, { "entropy": 0.030079898621734172, "epoch": 2.57937785224542, "grad_norm": 0.115234375, "learning_rate": 1.0139291139015495e-05, "loss": 0.03709140777587891, "mean_token_accuracy": 0.9910090592503548, "num_tokens": 30883425.0, "step": 29250 }, { "entropy": 0.029405801608154435, "epoch": 2.581582486386384, "grad_norm": 0.048095703125, "learning_rate": 1.0035119321190233e-05, "loss": 0.031949334144592285, "mean_token_accuracy": 0.991183122098446, "num_tokens": 30909134.0, "step": 29275 }, { "entropy": 0.028425286148940358, "epoch": 2.5837871205273486, "grad_norm": 1.6953125, "learning_rate": 9.931457128697131e-06, "loss": 0.0320594334602356, "mean_token_accuracy": 0.9920493552088737, "num_tokens": 30936305.0, "step": 29300 }, { "entropy": 0.027195464314481797, "epoch": 2.5859917546683127, "grad_norm": 0.625, "learning_rate": 9.828305148752881e-06, "loss": 0.032163431644439695, "mean_token_accuracy": 0.9921726885437966, "num_tokens": 30963031.0, "step": 29325 }, { "entropy": 0.027875148742787133, "epoch": 2.588196388809277, "grad_norm": 0.63671875, "learning_rate": 9.725663965683984e-06, "loss": 0.03857534646987915, "mean_token_accuracy": 0.9911905533075333, "num_tokens": 30990558.0, "step": 29350 }, { "entropy": 0.026536175619912682, "epoch": 2.5904010229502417, "grad_norm": 0.98046875, "learning_rate": 9.623534160923353e-06, "loss": 0.028181934356689455, "mean_token_accuracy": 0.9915832805633545, "num_tokens": 31016748.0, "step": 29375 }, { "entropy": 0.028007027512248896, "epoch": 2.5926056570912057, "grad_norm": 1.2578125, "learning_rate": 9.521916313007173e-06, "loss": 0.035958237648010254, "mean_token_accuracy": 0.9915932086110115, "num_tokens": 31044073.0, "step": 29400 }, { "entropy": 0.02840741300540685, "epoch": 2.5948102912321698, "grad_norm": 1.7109375, "learning_rate": 9.420810997571516e-06, "loss": 0.03402619361877442, "mean_token_accuracy": 0.9930353647470475, "num_tokens": 31070655.0, "step": 29425 }, { "entropy": 0.03006437236908823, "epoch": 2.5970149253731343, "grad_norm": 1.0, "learning_rate": 9.320218787349066e-06, "loss": 0.043330874443054196, "mean_token_accuracy": 0.9889759740233421, "num_tokens": 31097111.0, "step": 29450 }, { "entropy": 0.03026204977832094, "epoch": 2.5992195595140988, "grad_norm": 1.46875, "learning_rate": 9.220140252165931e-06, "loss": 0.03800657749176026, "mean_token_accuracy": 0.9887870910763741, "num_tokens": 31124102.0, "step": 29475 }, { "entropy": 0.028786359129080664, "epoch": 2.601424193655063, "grad_norm": 2.453125, "learning_rate": 9.120575958938416e-06, "loss": 0.02739781141281128, "mean_token_accuracy": 0.9923294255137444, "num_tokens": 31150596.0, "step": 29500 }, { "entropy": 0.02446969037204326, "epoch": 2.6036288277960273, "grad_norm": 1.546875, "learning_rate": 9.021526471669783e-06, "loss": 0.025637152194976805, "mean_token_accuracy": 0.9926092553138733, "num_tokens": 31177647.0, "step": 29525 }, { "entropy": 0.02096207481219608, "epoch": 2.6058334619369914, "grad_norm": 1.0078125, "learning_rate": 8.922992351447079e-06, "loss": 0.025728282928466798, "mean_token_accuracy": 0.9925812685489654, "num_tokens": 31202611.0, "step": 29550 }, { "entropy": 0.03069892433763016, "epoch": 2.608038096077956, "grad_norm": 1.0625, "learning_rate": 8.824974156437903e-06, "loss": 0.029011659622192383, "mean_token_accuracy": 0.9920935586094857, "num_tokens": 31227743.0, "step": 29575 }, { "entropy": 0.02603791271118098, "epoch": 2.6102427302189204, "grad_norm": 2.9375, "learning_rate": 8.72747244188733e-06, "loss": 0.031118543148040773, "mean_token_accuracy": 0.9923553898930549, "num_tokens": 31254607.0, "step": 29600 }, { "entropy": 0.02094233050909679, "epoch": 2.6124473643598844, "grad_norm": 0.84765625, "learning_rate": 8.630487760114703e-06, "loss": 0.020793261528015135, "mean_token_accuracy": 0.9937394097447395, "num_tokens": 31279142.0, "step": 29625 }, { "entropy": 0.027775917933613527, "epoch": 2.614651998500849, "grad_norm": 1.9453125, "learning_rate": 8.5340206605105e-06, "loss": 0.03821009874343872, "mean_token_accuracy": 0.9896869486570359, "num_tokens": 31304261.0, "step": 29650 }, { "entropy": 0.029270872360466455, "epoch": 2.616856632641813, "grad_norm": 0.208984375, "learning_rate": 8.438071689533288e-06, "loss": 0.03347602367401123, "mean_token_accuracy": 0.9911593902111053, "num_tokens": 31330018.0, "step": 29675 }, { "entropy": 0.02950705551649662, "epoch": 2.6190612667827775, "grad_norm": 0.291015625, "learning_rate": 8.34264139070653e-06, "loss": 0.046378650665283204, "mean_token_accuracy": 0.9907181602716446, "num_tokens": 31356650.0, "step": 29700 }, { "entropy": 0.029709462454957248, "epoch": 2.6212659009237416, "grad_norm": 1.2109375, "learning_rate": 8.247730304615609e-06, "loss": 0.04107787609100342, "mean_token_accuracy": 0.9899954950809479, "num_tokens": 31383748.0, "step": 29725 }, { "entropy": 0.031126942403589056, "epoch": 2.623470535064706, "grad_norm": 0.62109375, "learning_rate": 8.153338968904723e-06, "loss": 0.03897136926651001, "mean_token_accuracy": 0.9887469747662544, "num_tokens": 31410939.0, "step": 29750 }, { "entropy": 0.02810529593278261, "epoch": 2.62567516920567, "grad_norm": 0.69921875, "learning_rate": 8.059467918273756e-06, "loss": 0.02849973678588867, "mean_token_accuracy": 0.9866181939840317, "num_tokens": 31438643.0, "step": 29775 }, { "entropy": 0.027220633971937788, "epoch": 2.6278798033466346, "grad_norm": 1.2734375, "learning_rate": 7.966117684475393e-06, "loss": 0.039316484928131105, "mean_token_accuracy": 0.9915398034453392, "num_tokens": 31466576.0, "step": 29800 }, { "entropy": 0.02613406105301692, "epoch": 2.630084437487599, "grad_norm": 0.859375, "learning_rate": 7.87328879631204e-06, "loss": 0.027877295017242433, "mean_token_accuracy": 0.9921690738201141, "num_tokens": 31494426.0, "step": 29825 }, { "entropy": 0.029777123154854052, "epoch": 2.632289071628563, "grad_norm": 0.8671875, "learning_rate": 7.780981779632823e-06, "loss": 0.03954046487808228, "mean_token_accuracy": 0.9905496680736542, "num_tokens": 31520790.0, "step": 29850 }, { "entropy": 0.031225387039357885, "epoch": 2.6344937057695277, "grad_norm": 0.9609375, "learning_rate": 7.689197157330564e-06, "loss": 0.046720662117004395, "mean_token_accuracy": 0.9887133800983429, "num_tokens": 31546939.0, "step": 29875 }, { "entropy": 0.02381926384405233, "epoch": 2.6366983399104917, "grad_norm": 1.5234375, "learning_rate": 7.597935449338944e-06, "loss": 0.03087226867675781, "mean_token_accuracy": 0.9920753428339958, "num_tokens": 31572240.0, "step": 29900 }, { "entropy": 0.03250588172009884, "epoch": 2.6389029740514562, "grad_norm": 2.375, "learning_rate": 7.507197172629432e-06, "loss": 0.04254406452178955, "mean_token_accuracy": 0.990644511282444, "num_tokens": 31598840.0, "step": 29925 }, { "entropy": 0.031087516101815708, "epoch": 2.6411076081924207, "grad_norm": 1.09375, "learning_rate": 7.416982841208453e-06, "loss": 0.042113037109375, "mean_token_accuracy": 0.987711393237114, "num_tokens": 31626112.0, "step": 29950 }, { "entropy": 0.0267231866216207, "epoch": 2.643312242333385, "grad_norm": 0.55078125, "learning_rate": 7.327292966114374e-06, "loss": 0.026836345195770262, "mean_token_accuracy": 0.9923613077402115, "num_tokens": 31651914.0, "step": 29975 }, { "entropy": 0.026771614199224133, "epoch": 2.645516876474349, "grad_norm": 0.91796875, "learning_rate": 7.238128055414706e-06, "loss": 0.026218461990356445, "mean_token_accuracy": 0.9917361649870873, "num_tokens": 31678089.0, "step": 30000 }, { "epoch": 2.645516876474349, "eval_entropy": 0.015163507781158242, "eval_loss": 0.02418082021176815, "eval_mean_token_accuracy": 0.9927960444442617, "eval_num_tokens": 31678089.0, "eval_runtime": 245.6766, "eval_samples_per_second": 15.993, "eval_steps_per_second": 4.001, "step": 30000 }, { "entropy": 0.030485192148407805, "epoch": 2.6477215106153134, "grad_norm": 2.734375, "learning_rate": 7.1494886142031925e-06, "loss": 0.03966634750366211, "mean_token_accuracy": 0.9914444166421891, "num_tokens": 31704822.0, "step": 30025 }, { "entropy": 0.02233901047420659, "epoch": 2.649926144756278, "grad_norm": 0.80078125, "learning_rate": 7.0613751445969265e-06, "loss": 0.023667454719543457, "mean_token_accuracy": 0.993430680334568, "num_tokens": 31730622.0, "step": 30050 }, { "entropy": 0.024482036822882948, "epoch": 2.652130778897242, "grad_norm": 0.67578125, "learning_rate": 6.9737881457335506e-06, "loss": 0.02852684736251831, "mean_token_accuracy": 0.991863000690937, "num_tokens": 31756387.0, "step": 30075 }, { "entropy": 0.027239963827469182, "epoch": 2.6543354130382064, "grad_norm": 1.0546875, "learning_rate": 6.886728113768337e-06, "loss": 0.03446184396743775, "mean_token_accuracy": 0.9912598931789398, "num_tokens": 31782098.0, "step": 30100 }, { "entropy": 0.02890480452515476, "epoch": 2.6565400471791705, "grad_norm": 1.0078125, "learning_rate": 6.8001955418714905e-06, "loss": 0.036803393363952636, "mean_token_accuracy": 0.9865454795956612, "num_tokens": 31810547.0, "step": 30125 }, { "entropy": 0.029436326650466072, "epoch": 2.658744681320135, "grad_norm": 1.0546875, "learning_rate": 6.714190920225283e-06, "loss": 0.03758874416351318, "mean_token_accuracy": 0.9899130940437317, "num_tokens": 31838640.0, "step": 30150 }, { "entropy": 0.027448534051000024, "epoch": 2.6609493154610995, "grad_norm": 1.71875, "learning_rate": 6.628714736021358e-06, "loss": 0.031783337593078616, "mean_token_accuracy": 0.9929012563824654, "num_tokens": 31863958.0, "step": 30175 }, { "entropy": 0.023979724997479933, "epoch": 2.6631539496020635, "grad_norm": 2.046875, "learning_rate": 6.543767473457807e-06, "loss": 0.026163532733917236, "mean_token_accuracy": 0.9927327701449394, "num_tokens": 31890174.0, "step": 30200 }, { "entropy": 0.028794102249594288, "epoch": 2.6653585837430276, "grad_norm": 0.470703125, "learning_rate": 6.45934961373662e-06, "loss": 0.03421214580535889, "mean_token_accuracy": 0.9915524750947953, "num_tokens": 31915944.0, "step": 30225 }, { "entropy": 0.0330623129722153, "epoch": 2.667563217883992, "grad_norm": 2.796875, "learning_rate": 6.3754616350608424e-06, "loss": 0.047150373458862305, "mean_token_accuracy": 0.9889357024431229, "num_tokens": 31942686.0, "step": 30250 }, { "entropy": 0.02424276659203315, "epoch": 2.6697678520249566, "grad_norm": 0.3671875, "learning_rate": 6.292104012631905e-06, "loss": 0.034151678085327146, "mean_token_accuracy": 0.991973640024662, "num_tokens": 31968763.0, "step": 30275 }, { "entropy": 0.022598919857009604, "epoch": 2.6719724861659206, "grad_norm": 0.345703125, "learning_rate": 6.20927721864687e-06, "loss": 0.024938690662384033, "mean_token_accuracy": 0.9939743283390999, "num_tokens": 31995639.0, "step": 30300 }, { "entropy": 0.027603573197920922, "epoch": 2.674177120306885, "grad_norm": 0.75390625, "learning_rate": 6.1269817222958995e-06, "loss": 0.0316735577583313, "mean_token_accuracy": 0.9919213426113128, "num_tokens": 32021121.0, "step": 30325 }, { "entropy": 0.029339179949456593, "epoch": 2.676381754447849, "grad_norm": 0.8515625, "learning_rate": 6.045217989759411e-06, "loss": 0.036246950626373294, "mean_token_accuracy": 0.9915802147984505, "num_tokens": 32048113.0, "step": 30350 }, { "entropy": 0.02590386390213098, "epoch": 2.6785863885888137, "grad_norm": 1.484375, "learning_rate": 5.9639864842056084e-06, "loss": 0.026949644088745117, "mean_token_accuracy": 0.9931145197153092, "num_tokens": 32074183.0, "step": 30375 }, { "entropy": 0.031569371115929244, "epoch": 2.680791022729778, "grad_norm": 1.5859375, "learning_rate": 5.883287665787729e-06, "loss": 0.03619576454162598, "mean_token_accuracy": 0.9898512217402459, "num_tokens": 32100552.0, "step": 30400 }, { "entropy": 0.03282993205477396, "epoch": 2.6829956568707423, "grad_norm": 3.84375, "learning_rate": 5.803121991641513e-06, "loss": 0.041049656867980955, "mean_token_accuracy": 0.9897631332278252, "num_tokens": 32127816.0, "step": 30425 }, { "entropy": 0.030889862734293273, "epoch": 2.6852002910117068, "grad_norm": 0.76953125, "learning_rate": 5.723489915882607e-06, "loss": 0.029335715770721436, "mean_token_accuracy": 0.992390308380127, "num_tokens": 32155551.0, "step": 30450 }, { "entropy": 0.030821244430044317, "epoch": 2.687404925152671, "grad_norm": 1.4453125, "learning_rate": 5.644391889603951e-06, "loss": 0.03456997871398926, "mean_token_accuracy": 0.9896837142109871, "num_tokens": 32182672.0, "step": 30475 }, { "entropy": 0.022968131275520137, "epoch": 2.6896095592936353, "grad_norm": 1.0078125, "learning_rate": 5.565828360873226e-06, "loss": 0.03449763298034668, "mean_token_accuracy": 0.9907573989033699, "num_tokens": 32210056.0, "step": 30500 }, { "entropy": 0.021756964745400183, "epoch": 2.6918141934345994, "grad_norm": 0.80078125, "learning_rate": 5.487799774730395e-06, "loss": 0.02377518653869629, "mean_token_accuracy": 0.9940974757075309, "num_tokens": 32235991.0, "step": 30525 }, { "entropy": 0.02951632114707536, "epoch": 2.694018827575564, "grad_norm": 1.734375, "learning_rate": 5.410306573185109e-06, "loss": 0.03300446033477783, "mean_token_accuracy": 0.9902588561177254, "num_tokens": 32262850.0, "step": 30550 }, { "entropy": 0.02724777233459463, "epoch": 2.696223461716528, "grad_norm": 0.65234375, "learning_rate": 5.333349195214166e-06, "loss": 0.03714723348617554, "mean_token_accuracy": 0.9866947430372238, "num_tokens": 32290242.0, "step": 30575 }, { "entropy": 0.02796483456815622, "epoch": 2.6984280958574924, "grad_norm": 0.16015625, "learning_rate": 5.256928076759115e-06, "loss": 0.033420143127441404, "mean_token_accuracy": 0.9920667752623558, "num_tokens": 32315628.0, "step": 30600 }, { "entropy": 0.02718132355395937, "epoch": 2.700632729998457, "grad_norm": 1.21875, "learning_rate": 5.181043650723761e-06, "loss": 0.03775918960571289, "mean_token_accuracy": 0.990137320458889, "num_tokens": 32341921.0, "step": 30625 }, { "entropy": 0.02674400442670958, "epoch": 2.702837364139421, "grad_norm": 0.58984375, "learning_rate": 5.105696346971678e-06, "loss": 0.032031383514404294, "mean_token_accuracy": 0.9912699130177498, "num_tokens": 32368912.0, "step": 30650 }, { "entropy": 0.03207970959640079, "epoch": 2.7050419982803855, "grad_norm": 0.96484375, "learning_rate": 5.030886592323825e-06, "loss": 0.03718382120132446, "mean_token_accuracy": 0.9903065833449364, "num_tokens": 32396094.0, "step": 30675 }, { "entropy": 0.031730443536853274, "epoch": 2.7072466324213496, "grad_norm": 0.55859375, "learning_rate": 4.956614810556038e-06, "loss": 0.04166868686676026, "mean_token_accuracy": 0.9892936205863953, "num_tokens": 32422076.0, "step": 30700 }, { "entropy": 0.029964850165561073, "epoch": 2.709451266562314, "grad_norm": 0.69921875, "learning_rate": 4.882881422396768e-06, "loss": 0.02965547323226929, "mean_token_accuracy": 0.992727922797203, "num_tokens": 32448492.0, "step": 30725 }, { "entropy": 0.025802147214999422, "epoch": 2.7116559007032786, "grad_norm": 0.73828125, "learning_rate": 4.809686845524553e-06, "loss": 0.028469092845916748, "mean_token_accuracy": 0.986219149529934, "num_tokens": 32475202.0, "step": 30750 }, { "entropy": 0.02441559437660544, "epoch": 2.7138605348442426, "grad_norm": 0.54296875, "learning_rate": 4.73703149456578e-06, "loss": 0.024286515712738037, "mean_token_accuracy": 0.9930135196447373, "num_tokens": 32500990.0, "step": 30775 }, { "entropy": 0.02470657176872919, "epoch": 2.7160651689852067, "grad_norm": 0.125, "learning_rate": 4.664915781092227e-06, "loss": 0.027780747413635253, "mean_token_accuracy": 0.9930829498171806, "num_tokens": 32527121.0, "step": 30800 }, { "entropy": 0.02191462902490457, "epoch": 2.718269803126171, "grad_norm": 2.015625, "learning_rate": 4.593340113618783e-06, "loss": 0.028052380084991457, "mean_token_accuracy": 0.9934657236933708, "num_tokens": 32554159.0, "step": 30825 }, { "entropy": 0.030870509027117805, "epoch": 2.7204744372671357, "grad_norm": 1.0546875, "learning_rate": 4.522304897601149e-06, "loss": 0.03652215957641602, "mean_token_accuracy": 0.9900946572422982, "num_tokens": 32582393.0, "step": 30850 }, { "entropy": 0.02534176224733528, "epoch": 2.7226790714080997, "grad_norm": 1.8359375, "learning_rate": 4.451810535433532e-06, "loss": 0.034189610481262206, "mean_token_accuracy": 0.9922301995754242, "num_tokens": 32608359.0, "step": 30875 }, { "entropy": 0.022799009535629012, "epoch": 2.7248837055490642, "grad_norm": 0.875, "learning_rate": 4.381857426446323e-06, "loss": 0.0269643759727478, "mean_token_accuracy": 0.992847872376442, "num_tokens": 32633993.0, "step": 30900 }, { "entropy": 0.025321687431678584, "epoch": 2.7270883396900283, "grad_norm": 1.5625, "learning_rate": 4.312445966903911e-06, "loss": 0.033355269432067874, "mean_token_accuracy": 0.9916208279132843, "num_tokens": 32660746.0, "step": 30925 }, { "entropy": 0.02630714648788853, "epoch": 2.729292973830993, "grad_norm": 1.28125, "learning_rate": 4.243576550002348e-06, "loss": 0.03572848796844483, "mean_token_accuracy": 0.9908462983369827, "num_tokens": 32687066.0, "step": 30950 }, { "entropy": 0.02801416463107671, "epoch": 2.7314976079719573, "grad_norm": 1.9765625, "learning_rate": 4.175249565867212e-06, "loss": 0.03111718416213989, "mean_token_accuracy": 0.9914992889761924, "num_tokens": 32714187.0, "step": 30975 }, { "entropy": 0.025514190020621753, "epoch": 2.7337022421129213, "grad_norm": 1.65625, "learning_rate": 4.107465401551347e-06, "loss": 0.02301668405532837, "mean_token_accuracy": 0.9932084521651268, "num_tokens": 32739302.0, "step": 31000 }, { "epoch": 2.7337022421129213, "eval_entropy": 0.015138276679110206, "eval_loss": 0.024185428395867348, "eval_mean_token_accuracy": 0.9928067592128975, "eval_num_tokens": 32739302.0, "eval_runtime": 243.8232, "eval_samples_per_second": 16.114, "eval_steps_per_second": 4.032, "step": 31000 }, { "entropy": 0.02361316845028341, "epoch": 2.7359068762538854, "grad_norm": 1.0234375, "learning_rate": 4.040224441032647e-06, "loss": 0.02711350440979004, "mean_token_accuracy": 0.9928304886817932, "num_tokens": 32764350.0, "step": 31025 }, { "entropy": 0.0276046938167201, "epoch": 2.73811151039485, "grad_norm": 2.421875, "learning_rate": 3.973527065211924e-06, "loss": 0.034292078018188475, "mean_token_accuracy": 0.9921803194284439, "num_tokens": 32790544.0, "step": 31050 }, { "entropy": 0.02835739084504894, "epoch": 2.7403161445358144, "grad_norm": 0.796875, "learning_rate": 3.9073736519107615e-06, "loss": 0.030812058448791504, "mean_token_accuracy": 0.9924408486485481, "num_tokens": 32817371.0, "step": 31075 }, { "entropy": 0.026140070521105372, "epoch": 2.7425207786767785, "grad_norm": 0.287109375, "learning_rate": 3.841764575869356e-06, "loss": 0.03151321649551392, "mean_token_accuracy": 0.9925504148006439, "num_tokens": 32844316.0, "step": 31100 }, { "entropy": 0.021448419220723734, "epoch": 2.744725412817743, "grad_norm": 1.6796875, "learning_rate": 3.7767002087443214e-06, "loss": 0.027071454524993897, "mean_token_accuracy": 0.9938805449008942, "num_tokens": 32869921.0, "step": 31125 }, { "entropy": 0.027382947859623526, "epoch": 2.746930046958707, "grad_norm": 2.03125, "learning_rate": 3.7121809191067225e-06, "loss": 0.034256911277770995, "mean_token_accuracy": 0.9918771433830261, "num_tokens": 32895789.0, "step": 31150 }, { "entropy": 0.025478750003985626, "epoch": 2.7491346810996715, "grad_norm": 2.65625, "learning_rate": 3.6482070724399022e-06, "loss": 0.036122303009033206, "mean_token_accuracy": 0.9917540955543518, "num_tokens": 32921732.0, "step": 31175 }, { "entropy": 0.030465085982468734, "epoch": 2.751339315240636, "grad_norm": 0.265625, "learning_rate": 3.5847790311374085e-06, "loss": 0.035727314949035645, "mean_token_accuracy": 0.9871862959861756, "num_tokens": 32949378.0, "step": 31200 }, { "entropy": 0.02002370489099121, "epoch": 2.7535439493816, "grad_norm": 1.0546875, "learning_rate": 3.521897154500953e-06, "loss": 0.016289963722229003, "mean_token_accuracy": 0.9956188550591469, "num_tokens": 32975624.0, "step": 31225 }, { "entropy": 0.030022653135565635, "epoch": 2.7557485835225646, "grad_norm": 1.390625, "learning_rate": 3.4595617987384086e-06, "loss": 0.034364793300628665, "mean_token_accuracy": 0.9914705500006675, "num_tokens": 33002241.0, "step": 31250 }, { "entropy": 0.02831969540686259, "epoch": 2.7579532176635286, "grad_norm": 0.271484375, "learning_rate": 3.3977733169617276e-06, "loss": 0.03397629976272583, "mean_token_accuracy": 0.9931815955042839, "num_tokens": 33029769.0, "step": 31275 }, { "entropy": 0.024141516995041457, "epoch": 2.760157851804493, "grad_norm": 0.55078125, "learning_rate": 3.336532059185016e-06, "loss": 0.020577189922332765, "mean_token_accuracy": 0.9933001267910003, "num_tokens": 33054886.0, "step": 31300 }, { "entropy": 0.028197366523891106, "epoch": 2.762362485945457, "grad_norm": 3.171875, "learning_rate": 3.275838372322482e-06, "loss": 0.0250828218460083, "mean_token_accuracy": 0.99329480946064, "num_tokens": 33082486.0, "step": 31325 }, { "entropy": 0.025448178426740922, "epoch": 2.7645671200864217, "grad_norm": 0.271484375, "learning_rate": 3.215692600186504e-06, "loss": 0.02443751573562622, "mean_token_accuracy": 0.9934353199601174, "num_tokens": 33108507.0, "step": 31350 }, { "entropy": 0.024398964844294824, "epoch": 2.7667717542273857, "grad_norm": 0.63671875, "learning_rate": 3.1560950834856995e-06, "loss": 0.023592844009399414, "mean_token_accuracy": 0.9932548335194588, "num_tokens": 33135112.0, "step": 31375 }, { "entropy": 0.02761033813996619, "epoch": 2.7689763883683502, "grad_norm": 2.453125, "learning_rate": 3.0970461598229806e-06, "loss": 0.030847842693328856, "mean_token_accuracy": 0.9923774653673172, "num_tokens": 33161778.0, "step": 31400 }, { "entropy": 0.023629968942695995, "epoch": 2.7711810225093148, "grad_norm": 0.79296875, "learning_rate": 3.0385461636935896e-06, "loss": 0.02890580177307129, "mean_token_accuracy": 0.9930263885855675, "num_tokens": 33187952.0, "step": 31425 }, { "entropy": 0.031088588998973137, "epoch": 2.773385656650279, "grad_norm": 0.05126953125, "learning_rate": 2.9805954264832903e-06, "loss": 0.039121434688568116, "mean_token_accuracy": 0.990178719162941, "num_tokens": 33214650.0, "step": 31450 }, { "entropy": 0.025570228301403403, "epoch": 2.7755902907912433, "grad_norm": 1.96875, "learning_rate": 2.9231942764664566e-06, "loss": 0.034187076091766355, "mean_token_accuracy": 0.9918478041887283, "num_tokens": 33241014.0, "step": 31475 }, { "entropy": 0.027400689849127958, "epoch": 2.7777949249322074, "grad_norm": 1.0, "learning_rate": 2.8663430388041977e-06, "loss": 0.027099764347076415, "mean_token_accuracy": 0.9917295035719872, "num_tokens": 33268017.0, "step": 31500 }, { "entropy": 0.03041357980109751, "epoch": 2.779999559073172, "grad_norm": 0.1142578125, "learning_rate": 2.8100420355424927e-06, "loss": 0.03652872085571289, "mean_token_accuracy": 0.9906187650561332, "num_tokens": 33295932.0, "step": 31525 }, { "entropy": 0.023346892858407953, "epoch": 2.7822041932141364, "grad_norm": 1.2890625, "learning_rate": 2.754291585610458e-06, "loss": 0.02764610290527344, "mean_token_accuracy": 0.9924876546859741, "num_tokens": 33321774.0, "step": 31550 }, { "entropy": 0.027840503481929773, "epoch": 2.7844088273551004, "grad_norm": 0.1171875, "learning_rate": 2.699092004818449e-06, "loss": 0.03487974882125854, "mean_token_accuracy": 0.9911468136310577, "num_tokens": 33347870.0, "step": 31575 }, { "entropy": 0.02395780631508387, "epoch": 2.7866134614960645, "grad_norm": 0.3984375, "learning_rate": 2.6444436058563284e-06, "loss": 0.030053725242614748, "mean_token_accuracy": 0.9926697811484337, "num_tokens": 33373842.0, "step": 31600 }, { "entropy": 0.03379408556993439, "epoch": 2.788818095637029, "grad_norm": 1.34375, "learning_rate": 2.5903466982916235e-06, "loss": 0.040092172622680666, "mean_token_accuracy": 0.9891455551981926, "num_tokens": 33400897.0, "step": 31625 }, { "entropy": 0.026423628384582117, "epoch": 2.7910227297779935, "grad_norm": 0.54296875, "learning_rate": 2.536801588567861e-06, "loss": 0.030896174907684325, "mean_token_accuracy": 0.9924877586960793, "num_tokens": 33427390.0, "step": 31650 }, { "entropy": 0.025224947988645, "epoch": 2.7932273639189575, "grad_norm": 0.384765625, "learning_rate": 2.4838085800028e-06, "loss": 0.030030651092529295, "mean_token_accuracy": 0.9931579613685608, "num_tokens": 33452449.0, "step": 31675 }, { "entropy": 0.02849430440346623, "epoch": 2.795431998059922, "grad_norm": 2.15625, "learning_rate": 2.431367972786669e-06, "loss": 0.03065213918685913, "mean_token_accuracy": 0.9911214289069176, "num_tokens": 33480216.0, "step": 31700 }, { "entropy": 0.023187637610408273, "epoch": 2.797636632200886, "grad_norm": 1.0546875, "learning_rate": 2.3794800639805326e-06, "loss": 0.028720135688781737, "mean_token_accuracy": 0.9924741345643997, "num_tokens": 33505655.0, "step": 31725 }, { "entropy": 0.028415085796841596, "epoch": 2.7998412663418506, "grad_norm": 1.6484375, "learning_rate": 2.3281451475145265e-06, "loss": 0.035433664321899414, "mean_token_accuracy": 0.9908634713292122, "num_tokens": 33533126.0, "step": 31750 }, { "entropy": 0.024498703280369228, "epoch": 2.802045900482815, "grad_norm": 0.67578125, "learning_rate": 2.2773635141863146e-06, "loss": 0.034044148921966555, "mean_token_accuracy": 0.9911482638120651, "num_tokens": 33560293.0, "step": 31775 }, { "entropy": 0.022807922580341256, "epoch": 2.804250534623779, "grad_norm": 0.66796875, "learning_rate": 2.2271354516593345e-06, "loss": 0.026914541721343995, "mean_token_accuracy": 0.9931732827425003, "num_tokens": 33586912.0, "step": 31800 }, { "entropy": 0.028503268719650806, "epoch": 2.8064551687647437, "grad_norm": 1.703125, "learning_rate": 2.1774612444611763e-06, "loss": 0.0329655909538269, "mean_token_accuracy": 0.9907631108164787, "num_tokens": 33613367.0, "step": 31825 }, { "entropy": 0.03049155484219227, "epoch": 2.8086598029057077, "grad_norm": 0.8515625, "learning_rate": 2.128341173982029e-06, "loss": 0.03732057809829712, "mean_token_accuracy": 0.9890658557415009, "num_tokens": 33640305.0, "step": 31850 }, { "entropy": 0.023213110088054236, "epoch": 2.810864437046672, "grad_norm": 0.71875, "learning_rate": 2.0797755184730703e-06, "loss": 0.02541722059249878, "mean_token_accuracy": 0.9929253304004669, "num_tokens": 33665942.0, "step": 31875 }, { "entropy": 0.029790412700567686, "epoch": 2.8130690711876363, "grad_norm": 1.34375, "learning_rate": 2.031764553044846e-06, "loss": 0.033856201171875, "mean_token_accuracy": 0.9906247487664223, "num_tokens": 33693648.0, "step": 31900 }, { "entropy": 0.025559156673880354, "epoch": 2.8152737053286008, "grad_norm": 0.2041015625, "learning_rate": 1.9843085496657363e-06, "loss": 0.024002790451049805, "mean_token_accuracy": 0.9935007789731025, "num_tokens": 33719559.0, "step": 31925 }, { "entropy": 0.030051562321277744, "epoch": 2.817478339469565, "grad_norm": 1.1484375, "learning_rate": 1.9374077771604717e-06, "loss": 0.034890377521514894, "mean_token_accuracy": 0.9901774021983146, "num_tokens": 33745656.0, "step": 31950 }, { "entropy": 0.029569217341013428, "epoch": 2.8196829736105293, "grad_norm": 0.7578125, "learning_rate": 1.8910625012084849e-06, "loss": 0.045575013160705564, "mean_token_accuracy": 0.9892233127355575, "num_tokens": 33773069.0, "step": 31975 }, { "entropy": 0.0297723802717519, "epoch": 2.821887607751494, "grad_norm": 0.3984375, "learning_rate": 1.8452729843425166e-06, "loss": 0.031798024177551266, "mean_token_accuracy": 0.9912262725830078, "num_tokens": 33799323.0, "step": 32000 }, { "epoch": 2.821887607751494, "eval_entropy": 0.01515091881442565, "eval_loss": 0.024167899042367935, "eval_mean_token_accuracy": 0.9927796902758422, "eval_num_tokens": 33799323.0, "eval_runtime": 224.6709, "eval_samples_per_second": 17.488, "eval_steps_per_second": 4.375, "step": 32000 }, { "entropy": 0.03310864323779242, "epoch": 2.824092241892458, "grad_norm": 1.5703125, "learning_rate": 1.8000394859471248e-06, "loss": 0.03980295896530151, "mean_token_accuracy": 0.9912154677510262, "num_tokens": 33826782.0, "step": 32025 }, { "entropy": 0.025057866033166648, "epoch": 2.8262968760334224, "grad_norm": 1.2109375, "learning_rate": 1.7553622622571097e-06, "loss": 0.027699394226074217, "mean_token_accuracy": 0.9926194402575493, "num_tokens": 33854056.0, "step": 32050 }, { "entropy": 0.02458085154845321, "epoch": 2.8285015101743864, "grad_norm": 0.18359375, "learning_rate": 1.7112415663562032e-06, "loss": 0.032873761653900144, "mean_token_accuracy": 0.9916874733567238, "num_tokens": 33879518.0, "step": 32075 }, { "entropy": 0.03164592763743713, "epoch": 2.830706144315351, "grad_norm": 0.90625, "learning_rate": 1.667677648175503e-06, "loss": 0.03423006057739258, "mean_token_accuracy": 0.9900494894385338, "num_tokens": 33906134.0, "step": 32100 }, { "entropy": 0.02424717899073585, "epoch": 2.8329107784563154, "grad_norm": 1.65625, "learning_rate": 1.624670754492197e-06, "loss": 0.027348561286926268, "mean_token_accuracy": 0.9924674391746521, "num_tokens": 33931392.0, "step": 32125 }, { "entropy": 0.024996388291965557, "epoch": 2.8351154125972795, "grad_norm": 2.28125, "learning_rate": 1.5822211289280077e-06, "loss": 0.027897491455078124, "mean_token_accuracy": 0.9916456484794617, "num_tokens": 33957012.0, "step": 32150 }, { "entropy": 0.02605572233511339, "epoch": 2.8373200467382436, "grad_norm": 2.5625, "learning_rate": 1.5403290119479275e-06, "loss": 0.0301747727394104, "mean_token_accuracy": 0.9904224568605423, "num_tokens": 33982679.0, "step": 32175 }, { "entropy": 0.025316978948030738, "epoch": 2.839524680879208, "grad_norm": 1.03125, "learning_rate": 1.4989946408588419e-06, "loss": 0.03385440587997437, "mean_token_accuracy": 0.9931662768125534, "num_tokens": 34009263.0, "step": 32200 }, { "entropy": 0.02666074657916397, "epoch": 2.8417293150201726, "grad_norm": 2.46875, "learning_rate": 1.4582182498081521e-06, "loss": 0.026826119422912596, "mean_token_accuracy": 0.9928840225934983, "num_tokens": 34035632.0, "step": 32225 }, { "entropy": 0.019749616278095346, "epoch": 2.8439339491611366, "grad_norm": 0.09521484375, "learning_rate": 1.4180000697824437e-06, "loss": 0.029020504951477052, "mean_token_accuracy": 0.9939252683520317, "num_tokens": 34059537.0, "step": 32250 }, { "entropy": 0.025551201240523368, "epoch": 2.846138583302101, "grad_norm": 1.65625, "learning_rate": 1.378340328606198e-06, "loss": 0.02985560417175293, "mean_token_accuracy": 0.9924237194657326, "num_tokens": 34085547.0, "step": 32275 }, { "entropy": 0.02937892486921555, "epoch": 2.848343217443065, "grad_norm": 1.9921875, "learning_rate": 1.3392392509405383e-06, "loss": 0.03745560646057129, "mean_token_accuracy": 0.9910514345765113, "num_tokens": 34111234.0, "step": 32300 }, { "entropy": 0.0265326060807638, "epoch": 2.8505478515840297, "grad_norm": 0.0986328125, "learning_rate": 1.3006970582818856e-06, "loss": 0.03232823848724365, "mean_token_accuracy": 0.9928913420438766, "num_tokens": 34138146.0, "step": 32325 }, { "entropy": 0.02657694019340852, "epoch": 2.852752485724994, "grad_norm": 1.5703125, "learning_rate": 1.262713968960738e-06, "loss": 0.032440063953399656, "mean_token_accuracy": 0.9916506347060203, "num_tokens": 34163976.0, "step": 32350 }, { "entropy": 0.02390991311860489, "epoch": 2.8549571198659582, "grad_norm": 0.1865234375, "learning_rate": 1.2252901981404384e-06, "loss": 0.028550803661346436, "mean_token_accuracy": 0.9930156728625298, "num_tokens": 34189547.0, "step": 32375 }, { "entropy": 0.03081292722563376, "epoch": 2.8571617540069223, "grad_norm": 1.7578125, "learning_rate": 1.188425957815953e-06, "loss": 0.040956239700317386, "mean_token_accuracy": 0.99051443785429, "num_tokens": 34216591.0, "step": 32400 }, { "entropy": 0.025143344738498854, "epoch": 2.859366388147887, "grad_norm": 0.5703125, "learning_rate": 1.1521214568126714e-06, "loss": 0.026441171169281005, "mean_token_accuracy": 0.9920197981595993, "num_tokens": 34243141.0, "step": 32425 }, { "entropy": 0.02643716373735515, "epoch": 2.8615710222888513, "grad_norm": 0.78125, "learning_rate": 1.1163769007851988e-06, "loss": 0.02717090606689453, "mean_token_accuracy": 0.9935029128193855, "num_tokens": 34270532.0, "step": 32450 }, { "entropy": 0.02689198448573734, "epoch": 2.8637756564298154, "grad_norm": 0.59765625, "learning_rate": 1.081192492216243e-06, "loss": 0.02757613182067871, "mean_token_accuracy": 0.9925964233279229, "num_tokens": 34297695.0, "step": 32475 }, { "entropy": 0.027454499909836158, "epoch": 2.86598029057078, "grad_norm": 0.65234375, "learning_rate": 1.0465684304154067e-06, "loss": 0.0293727707862854, "mean_token_accuracy": 0.9919403794407845, "num_tokens": 34324699.0, "step": 32500 }, { "entropy": 0.03476064759888686, "epoch": 2.868184924711744, "grad_norm": 2.3125, "learning_rate": 1.0125049115181196e-06, "loss": 0.0358671760559082, "mean_token_accuracy": 0.9892446520924568, "num_tokens": 34353069.0, "step": 32525 }, { "entropy": 0.028821960146378844, "epoch": 2.8703895588527084, "grad_norm": 3.25, "learning_rate": 9.79002128484463e-07, "loss": 0.03103492021560669, "mean_token_accuracy": 0.99189660936594, "num_tokens": 34378573.0, "step": 32550 }, { "entropy": 0.03503867352499583, "epoch": 2.872594192993673, "grad_norm": 0.8125, "learning_rate": 9.460602710981259e-07, "loss": 0.04088228225708008, "mean_token_accuracy": 0.9876779773831368, "num_tokens": 34404264.0, "step": 32575 }, { "entropy": 0.026292509975028223, "epoch": 2.874798827134637, "grad_norm": 0.302734375, "learning_rate": 9.136795259653386e-07, "loss": 0.027711949348449706, "mean_token_accuracy": 0.9927157282829284, "num_tokens": 34429879.0, "step": 32600 }, { "entropy": 0.032493346882838524, "epoch": 2.8770034612756015, "grad_norm": 1.4375, "learning_rate": 8.818600765137408e-07, "loss": 0.03947168827056885, "mean_token_accuracy": 0.9898266091942787, "num_tokens": 34457557.0, "step": 32625 }, { "entropy": 0.03031579275168042, "epoch": 2.8792080954165655, "grad_norm": 0.384765625, "learning_rate": 8.506021029914157e-07, "loss": 0.034112751483917236, "mean_token_accuracy": 0.990267367362976, "num_tokens": 34484320.0, "step": 32650 }, { "entropy": 0.025231020506616915, "epoch": 2.88141272955753, "grad_norm": 1.328125, "learning_rate": 8.199057824658574e-07, "loss": 0.02583768844604492, "mean_token_accuracy": 0.9929495406150818, "num_tokens": 34509823.0, "step": 32675 }, { "entropy": 0.028853209191984207, "epoch": 2.883617363698494, "grad_norm": 1.8515625, "learning_rate": 7.897712888229491e-07, "loss": 0.0317841649055481, "mean_token_accuracy": 0.9915751847624779, "num_tokens": 34536481.0, "step": 32700 }, { "entropy": 0.03166955886284995, "epoch": 2.8858219978394586, "grad_norm": 0.62109375, "learning_rate": 7.60198792765987e-07, "loss": 0.039300172328948973, "mean_token_accuracy": 0.9904856544733047, "num_tokens": 34563378.0, "step": 32725 }, { "entropy": 0.023482872382519417, "epoch": 2.8880266319804226, "grad_norm": 0.404296875, "learning_rate": 7.311884618147025e-07, "loss": 0.03237533330917358, "mean_token_accuracy": 0.9908891406655311, "num_tokens": 34588598.0, "step": 32750 }, { "entropy": 0.025234425218295654, "epoch": 2.890231266121387, "grad_norm": 1.203125, "learning_rate": 7.027404603043186e-07, "loss": 0.030891218185424806, "mean_token_accuracy": 0.9913925004005432, "num_tokens": 34615337.0, "step": 32775 }, { "entropy": 0.027402053751684433, "epoch": 2.8924359002623516, "grad_norm": 0.123046875, "learning_rate": 6.748549493846513e-07, "loss": 0.04195026397705078, "mean_token_accuracy": 0.9898461005091668, "num_tokens": 34642678.0, "step": 32800 }, { "entropy": 0.023586020493894466, "epoch": 2.8946405344033157, "grad_norm": 2.53125, "learning_rate": 6.475320870191315e-07, "loss": 0.026862666606903077, "mean_token_accuracy": 0.991207799911499, "num_tokens": 34667892.0, "step": 32825 }, { "entropy": 0.025827234457919984, "epoch": 2.89684516854428, "grad_norm": 2.359375, "learning_rate": 6.207720279839735e-07, "loss": 0.03438200950622559, "mean_token_accuracy": 0.9910271978378296, "num_tokens": 34694059.0, "step": 32850 }, { "entropy": 0.02829796871297731, "epoch": 2.8990498026852443, "grad_norm": 0.1923828125, "learning_rate": 5.945749238672527e-07, "loss": 0.031097068786621093, "mean_token_accuracy": 0.9911362925171852, "num_tokens": 34719150.0, "step": 32875 }, { "entropy": 0.024490521577972685, "epoch": 2.9012544368262088, "grad_norm": 0.2578125, "learning_rate": 5.689409230680843e-07, "loss": 0.027147409915924074, "mean_token_accuracy": 0.9921639716625213, "num_tokens": 34744016.0, "step": 32900 }, { "entropy": 0.029058779828665136, "epoch": 2.9034590709671733, "grad_norm": 0.671875, "learning_rate": 5.438701707957572e-07, "loss": 0.03907189846038819, "mean_token_accuracy": 0.9910078606009484, "num_tokens": 34771166.0, "step": 32925 }, { "entropy": 0.02809664033673471, "epoch": 2.9056637051081373, "grad_norm": 1.203125, "learning_rate": 5.193628090689018e-07, "loss": 0.03171214580535889, "mean_token_accuracy": 0.9920984748005867, "num_tokens": 34796092.0, "step": 32950 }, { "entropy": 0.024929613195490676, "epoch": 2.9078683392491014, "grad_norm": 0.2236328125, "learning_rate": 4.954189767147344e-07, "loss": 0.029733285903930665, "mean_token_accuracy": 0.9918975239992142, "num_tokens": 34822238.0, "step": 32975 }, { "entropy": 0.02411046184010047, "epoch": 2.910072973390066, "grad_norm": 0.1787109375, "learning_rate": 4.7203880936821375e-07, "loss": 0.03335120916366577, "mean_token_accuracy": 0.9924368995428086, "num_tokens": 34848162.0, "step": 33000 }, { "epoch": 2.910072973390066, "eval_entropy": 0.01513221942192457, "eval_loss": 0.02417258359491825, "eval_mean_token_accuracy": 0.9928157675787731, "eval_num_tokens": 34848162.0, "eval_runtime": 230.2757, "eval_samples_per_second": 17.062, "eval_steps_per_second": 4.269, "step": 33000 }, { "entropy": 0.028706058168027085, "epoch": 2.9122776075310304, "grad_norm": 3.40625, "learning_rate": 4.492224394712974e-07, "loss": 0.04000510692596435, "mean_token_accuracy": 0.9895797765254974, "num_tokens": 34874800.0, "step": 33025 }, { "entropy": 0.0267985630390649, "epoch": 2.9144822416719944, "grad_norm": 2.984375, "learning_rate": 4.2696999627221956e-07, "loss": 0.03146414995193481, "mean_token_accuracy": 0.9921754291653633, "num_tokens": 34900617.0, "step": 33050 }, { "entropy": 0.030926660760123924, "epoch": 2.916686875812959, "grad_norm": 1.609375, "learning_rate": 4.052816058246811e-07, "loss": 0.03377273797988892, "mean_token_accuracy": 0.9911105188727379, "num_tokens": 34928321.0, "step": 33075 }, { "entropy": 0.020773140575256546, "epoch": 2.918891509953923, "grad_norm": 1.46875, "learning_rate": 3.841573909872387e-07, "loss": 0.022910866737365723, "mean_token_accuracy": 0.9938488656282425, "num_tokens": 34954288.0, "step": 33100 }, { "entropy": 0.0312850209017779, "epoch": 2.9210961440948875, "grad_norm": 0.384765625, "learning_rate": 3.6359747142251654e-07, "loss": 0.0410335636138916, "mean_token_accuracy": 0.9900199168920517, "num_tokens": 34981329.0, "step": 33125 }, { "entropy": 0.025324092398041103, "epoch": 2.923300778235852, "grad_norm": 0.71875, "learning_rate": 3.436019635965848e-07, "loss": 0.02742032051086426, "mean_token_accuracy": 0.9904520413279534, "num_tokens": 35006783.0, "step": 33150 }, { "entropy": 0.031204914556074072, "epoch": 2.925505412376816, "grad_norm": 0.640625, "learning_rate": 3.241709807782822e-07, "loss": 0.04399027347564697, "mean_token_accuracy": 0.9883441299200058, "num_tokens": 35033941.0, "step": 33175 }, { "entropy": 0.03261162290466018, "epoch": 2.92771004651778, "grad_norm": 0.3828125, "learning_rate": 3.053046330385723e-07, "loss": 0.03782075881958008, "mean_token_accuracy": 0.9898115587234497, "num_tokens": 35060891.0, "step": 33200 }, { "entropy": 0.03443793041369645, "epoch": 2.9299146806587446, "grad_norm": 1.2734375, "learning_rate": 2.8700302724992134e-07, "loss": 0.04203218460083008, "mean_token_accuracy": 0.9881391364336014, "num_tokens": 35089278.0, "step": 33225 }, { "entropy": 0.027772624024983088, "epoch": 2.932119314799709, "grad_norm": 2.484375, "learning_rate": 2.692662670856883e-07, "loss": 0.026545183658599855, "mean_token_accuracy": 0.9932020205259323, "num_tokens": 35115886.0, "step": 33250 }, { "entropy": 0.030104288711700063, "epoch": 2.934323948940673, "grad_norm": 0.9453125, "learning_rate": 2.520944530195579e-07, "loss": 0.04437981128692627, "mean_token_accuracy": 0.9900987917184829, "num_tokens": 35143409.0, "step": 33275 }, { "entropy": 0.02149091817529552, "epoch": 2.9365285830816377, "grad_norm": 3.171875, "learning_rate": 2.35487682324953e-07, "loss": 0.027275030612945558, "mean_token_accuracy": 0.9921983778476715, "num_tokens": 35169840.0, "step": 33300 }, { "entropy": 0.02388738744515649, "epoch": 2.9387332172226017, "grad_norm": 0.05810546875, "learning_rate": 2.1944604907446765e-07, "loss": 0.03441377639770508, "mean_token_accuracy": 0.9911992436647415, "num_tokens": 35196080.0, "step": 33325 }, { "entropy": 0.03426591103016108, "epoch": 2.9409378513635662, "grad_norm": 1.1640625, "learning_rate": 2.0396964413937903e-07, "loss": 0.04174567222595215, "mean_token_accuracy": 0.9897310450673104, "num_tokens": 35223861.0, "step": 33350 }, { "entropy": 0.030975585428168414, "epoch": 2.9431424855045307, "grad_norm": 1.7265625, "learning_rate": 1.890585551890811e-07, "loss": 0.03444629192352295, "mean_token_accuracy": 0.9910498291254044, "num_tokens": 35250459.0, "step": 33375 }, { "entropy": 0.030872389084070164, "epoch": 2.945347119645495, "grad_norm": 1.21875, "learning_rate": 1.747128666906517e-07, "loss": 0.045081048011779784, "mean_token_accuracy": 0.989509349167347, "num_tokens": 35277415.0, "step": 33400 }, { "entropy": 0.02479782605125365, "epoch": 2.9475517537864593, "grad_norm": 0.73828125, "learning_rate": 1.609326599083083e-07, "loss": 0.025031945705413818, "mean_token_accuracy": 0.9937001445889473, "num_tokens": 35303505.0, "step": 33425 }, { "entropy": 0.02384750459346833, "epoch": 2.9497563879274233, "grad_norm": 1.2734375, "learning_rate": 1.477180129029754e-07, "loss": 0.02797969341278076, "mean_token_accuracy": 0.9879170176386833, "num_tokens": 35329409.0, "step": 33450 }, { "entropy": 0.02900122652774371, "epoch": 2.951961022068388, "grad_norm": 0.984375, "learning_rate": 1.3506900053186223e-07, "loss": 0.04034067630767822, "mean_token_accuracy": 0.9863409793376923, "num_tokens": 35356547.0, "step": 33475 }, { "entropy": 0.028102036683012557, "epoch": 2.954165656209352, "grad_norm": 1.03125, "learning_rate": 1.229856944480079e-07, "loss": 0.03228505849838257, "mean_token_accuracy": 0.992490217089653, "num_tokens": 35382846.0, "step": 33500 }, { "entropy": 0.030634857748700596, "epoch": 2.9563702903503164, "grad_norm": 2.34375, "learning_rate": 1.1146816309987041e-07, "loss": 0.03508009195327759, "mean_token_accuracy": 0.9879120439291, "num_tokens": 35408950.0, "step": 33525 }, { "entropy": 0.023159925354211738, "epoch": 2.9585749244912805, "grad_norm": 0.7734375, "learning_rate": 1.0051647173099365e-07, "loss": 0.024101905822753907, "mean_token_accuracy": 0.9922597792744636, "num_tokens": 35434210.0, "step": 33550 }, { "entropy": 0.03500574579287786, "epoch": 2.960779558632245, "grad_norm": 0.515625, "learning_rate": 9.013068237956335e-08, "loss": 0.05052834987640381, "mean_token_accuracy": 0.9885596024990082, "num_tokens": 35461997.0, "step": 33575 }, { "entropy": 0.026359798516132286, "epoch": 2.9629841927732095, "grad_norm": 0.421875, "learning_rate": 8.031085387811832e-08, "loss": 0.03985478401184082, "mean_token_accuracy": 0.9904181951284409, "num_tokens": 35487065.0, "step": 33600 }, { "entropy": 0.02394567044406358, "epoch": 2.9651888269141735, "grad_norm": 0.56640625, "learning_rate": 7.105704185316197e-08, "loss": 0.025043725967407227, "mean_token_accuracy": 0.9922685399651527, "num_tokens": 35511639.0, "step": 33625 }, { "entropy": 0.024473328073963786, "epoch": 2.967393461055138, "grad_norm": 0.9921875, "learning_rate": 6.236929872491804e-08, "loss": 0.02385477304458618, "mean_token_accuracy": 0.9946402052044868, "num_tokens": 35538356.0, "step": 33650 }, { "entropy": 0.028815567944038775, "epoch": 2.969598095196102, "grad_norm": 0.3828125, "learning_rate": 5.424767370695305e-08, "loss": 0.034439117908477784, "mean_token_accuracy": 0.986783909201622, "num_tokens": 35565031.0, "step": 33675 }, { "entropy": 0.03250583879053011, "epoch": 2.9718027293370666, "grad_norm": 0.59375, "learning_rate": 4.6692212805965475e-08, "loss": 0.037513306140899656, "mean_token_accuracy": 0.9900922834873199, "num_tokens": 35592243.0, "step": 33700 }, { "entropy": 0.027665380477410507, "epoch": 2.974007363478031, "grad_norm": 0.30859375, "learning_rate": 3.9702958821463684e-08, "loss": 0.034902803897857666, "mean_token_accuracy": 0.9913255712389946, "num_tokens": 35618924.0, "step": 33725 }, { "entropy": 0.026782970013227894, "epoch": 2.976211997618995, "grad_norm": 0.259765625, "learning_rate": 3.3279951345577265e-08, "loss": 0.044511117935180665, "mean_token_accuracy": 0.9920956519246101, "num_tokens": 35645678.0, "step": 33750 }, { "entropy": 0.032550619621688384, "epoch": 2.978416631759959, "grad_norm": 0.076171875, "learning_rate": 2.7423226762812725e-08, "loss": 0.042112269401550294, "mean_token_accuracy": 0.9892310863733291, "num_tokens": 35673477.0, "step": 33775 }, { "entropy": 0.02943160523022016, "epoch": 2.9806212659009237, "grad_norm": 2.515625, "learning_rate": 2.213281824984259e-08, "loss": 0.03308338642120361, "mean_token_accuracy": 0.9916199234127998, "num_tokens": 35699973.0, "step": 33800 }, { "entropy": 0.030929924099600613, "epoch": 2.982825900041888, "grad_norm": 0.453125, "learning_rate": 1.740875577531664e-08, "loss": 0.04199523448944092, "mean_token_accuracy": 0.984671610891819, "num_tokens": 35727474.0, "step": 33825 }, { "entropy": 0.03162372545186372, "epoch": 2.9850305341828522, "grad_norm": 0.66796875, "learning_rate": 1.3251066099684295e-08, "loss": 0.03665184259414673, "mean_token_accuracy": 0.991539233326912, "num_tokens": 35754131.0, "step": 33850 }, { "entropy": 0.02623558983214025, "epoch": 2.9872351683238167, "grad_norm": 1.8671875, "learning_rate": 9.659772775094666e-09, "loss": 0.030340723991394043, "mean_token_accuracy": 0.9928580421209335, "num_tokens": 35780165.0, "step": 33875 }, { "entropy": 0.023927184478125127, "epoch": 2.989439802464781, "grad_norm": 0.06396484375, "learning_rate": 6.634896145185643e-09, "loss": 0.03702516794204712, "mean_token_accuracy": 0.9920265239477157, "num_tokens": 35805641.0, "step": 33900 }, { "entropy": 0.022169054430632967, "epoch": 2.9916444366057453, "grad_norm": 0.96875, "learning_rate": 4.176453345017262e-09, "loss": 0.025482571125030516, "mean_token_accuracy": 0.9918701857328415, "num_tokens": 35830790.0, "step": 33925 }, { "entropy": 0.025008084610890366, "epoch": 2.99384907074671, "grad_norm": 0.80859375, "learning_rate": 2.28445830096069e-09, "loss": 0.0239717435836792, "mean_token_accuracy": 0.9920974162220955, "num_tokens": 35856028.0, "step": 33950 }, { "entropy": 0.02687357971597521, "epoch": 2.996053704887674, "grad_norm": 0.10546875, "learning_rate": 9.589217306316123e-10, "loss": 0.03782797336578369, "mean_token_accuracy": 0.9907488691806793, "num_tokens": 35882072.0, "step": 33975 }, { "entropy": 0.025469130103374482, "epoch": 2.9982583390286384, "grad_norm": 1.6484375, "learning_rate": 1.9985114282361493e-10, "loss": 0.027287905216217042, "mean_token_accuracy": 0.9933070641756058, "num_tokens": 35907980.0, "step": 34000 }, { "epoch": 2.9982583390286384, "eval_entropy": 0.015145191631567904, "eval_loss": 0.024168651551008224, "eval_mean_token_accuracy": 0.9928109939925426, "eval_num_tokens": 35907980.0, "eval_runtime": 225.267, "eval_samples_per_second": 17.442, "eval_steps_per_second": 4.364, "step": 34000 }, { "epoch": 3.0, "eval_entropy": 0.015145191631567904, "eval_loss": 0.024168651551008224, "eval_mean_token_accuracy": 0.9928109939925426, "eval_num_tokens": 35928990.0, "eval_runtime": 225.9543, "eval_samples_per_second": 17.388, "eval_steps_per_second": 4.35, "step": 34020 } ], "logging_steps": 25, "max_steps": 34020, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.569718455428137e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }