diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14029 @@ +{ + "best_global_step": 27000, + "best_metric": 0.024163657799363136, + "best_model_checkpoint": "trainer_output/checkpoint-27000", + "epoch": 3.0, + "eval_steps": 1000, + "global_step": 34020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.6916703507304192, + "epoch": 8.818536563857227e-05, + "grad_norm": 15.375, + "learning_rate": 0.0, + "loss": 0.5625149607658386, + "mean_token_accuracy": 0.8633110001683235, + "num_tokens": 1333.0, + "step": 1 + }, + { + "entropy": 0.5527553335608294, + "epoch": 0.002204634140964307, + "grad_norm": 4.09375, + "learning_rate": 4.701273261508325e-06, + "loss": 0.4002639452616374, + "mean_token_accuracy": 0.8936887547994653, + "num_tokens": 26348.0, + "step": 25 + }, + { + "entropy": 0.15097233657725156, + "epoch": 0.004409268281928614, + "grad_norm": 3.046875, + "learning_rate": 9.598432908912831e-06, + "loss": 0.1661180877685547, + "mean_token_accuracy": 0.9615221408009529, + "num_tokens": 52419.0, + "step": 50 + }, + { + "entropy": 0.12569273573113604, + "epoch": 0.006613902422892921, + "grad_norm": 3.140625, + "learning_rate": 1.4495592556317337e-05, + "loss": 0.15316170692443848, + "mean_token_accuracy": 0.9623415350914002, + "num_tokens": 79151.0, + "step": 75 + }, + { + "entropy": 0.10584518638905138, + "epoch": 0.008818536563857228, + "grad_norm": 2.78125, + "learning_rate": 1.9392752203721843e-05, + "loss": 0.1182499122619629, + "mean_token_accuracy": 0.972541985809803, + "num_tokens": 105057.0, + "step": 100 + }, + { + "entropy": 0.08121318614110351, + "epoch": 0.011023170704821535, + "grad_norm": 4.25, + "learning_rate": 2.428991185112635e-05, + "loss": 0.10249818801879883, + "mean_token_accuracy": 0.9750828278064728, + "num_tokens": 131925.0, + "step": 125 + }, + { + "entropy": 0.0871189395734109, + "epoch": 0.013227804845785842, + "grad_norm": 1.8828125, + "learning_rate": 2.9187071498530854e-05, + "loss": 0.11519923210144042, + "mean_token_accuracy": 0.9748159709572792, + "num_tokens": 158543.0, + "step": 150 + }, + { + "entropy": 0.0863258442163351, + "epoch": 0.015432438986750148, + "grad_norm": 2.703125, + "learning_rate": 3.408423114593536e-05, + "loss": 0.11372153282165527, + "mean_token_accuracy": 0.9735086694359779, + "num_tokens": 185289.0, + "step": 175 + }, + { + "entropy": 0.06604936945135705, + "epoch": 0.017637073127714457, + "grad_norm": 2.359375, + "learning_rate": 3.8981390793339866e-05, + "loss": 0.08210546493530274, + "mean_token_accuracy": 0.9802239489555359, + "num_tokens": 210755.0, + "step": 200 + }, + { + "entropy": 0.08488065572455525, + "epoch": 0.019841707268678763, + "grad_norm": 1.0, + "learning_rate": 4.387855044074437e-05, + "loss": 0.09681215286254882, + "mean_token_accuracy": 0.9729325622320175, + "num_tokens": 236918.0, + "step": 225 + }, + { + "entropy": 0.08888421370531432, + "epoch": 0.02204634140964307, + "grad_norm": 2.5, + "learning_rate": 4.877571008814887e-05, + "loss": 0.12263742446899414, + "mean_token_accuracy": 0.9723492181301117, + "num_tokens": 263894.0, + "step": 250 + }, + { + "entropy": 0.081377989416942, + "epoch": 0.024250975550607377, + "grad_norm": 2.15625, + "learning_rate": 5.367286973555338e-05, + "loss": 0.10161479949951172, + "mean_token_accuracy": 0.9720952340960503, + "num_tokens": 289789.0, + "step": 275 + }, + { + "entropy": 0.0684462244680617, + "epoch": 0.026455609691571683, + "grad_norm": 1.96875, + "learning_rate": 5.857002938295789e-05, + "loss": 0.08587800979614257, + "mean_token_accuracy": 0.9774800541996956, + "num_tokens": 315300.0, + "step": 300 + }, + { + "entropy": 0.08871725924720522, + "epoch": 0.02866024383253599, + "grad_norm": 1.1640625, + "learning_rate": 6.346718903036239e-05, + "loss": 0.10993636131286622, + "mean_token_accuracy": 0.9720117238163948, + "num_tokens": 341328.0, + "step": 325 + }, + { + "entropy": 0.06815292302024317, + "epoch": 0.030864877973500297, + "grad_norm": 1.7890625, + "learning_rate": 6.83643486777669e-05, + "loss": 0.09177898406982422, + "mean_token_accuracy": 0.9783439686894417, + "num_tokens": 367118.0, + "step": 350 + }, + { + "entropy": 0.08387046866118908, + "epoch": 0.03306951211446461, + "grad_norm": 0.419921875, + "learning_rate": 7.32615083251714e-05, + "loss": 0.11903386116027832, + "mean_token_accuracy": 0.9728097701072693, + "num_tokens": 393500.0, + "step": 375 + }, + { + "entropy": 0.08728675234102411, + "epoch": 0.03527414625542891, + "grad_norm": 1.859375, + "learning_rate": 7.81586679725759e-05, + "loss": 0.12638781547546388, + "mean_token_accuracy": 0.9733547866344452, + "num_tokens": 419948.0, + "step": 400 + }, + { + "entropy": 0.09208631370478543, + "epoch": 0.03747878039639322, + "grad_norm": 2.484375, + "learning_rate": 8.305582761998043e-05, + "loss": 0.13350732803344725, + "mean_token_accuracy": 0.9699632921814918, + "num_tokens": 446845.0, + "step": 425 + }, + { + "entropy": 0.10303217243927065, + "epoch": 0.03968341453735753, + "grad_norm": 3.65625, + "learning_rate": 8.795298726738492e-05, + "loss": 0.15950148582458495, + "mean_token_accuracy": 0.966689610183239, + "num_tokens": 474184.0, + "step": 450 + }, + { + "entropy": 0.10971505879715551, + "epoch": 0.04188804867832183, + "grad_norm": 1.90625, + "learning_rate": 9.285014691478942e-05, + "loss": 0.14567254066467286, + "mean_token_accuracy": 0.9634886494278908, + "num_tokens": 502243.0, + "step": 475 + }, + { + "entropy": 0.1019335692783352, + "epoch": 0.04409268281928614, + "grad_norm": 1.71875, + "learning_rate": 9.774730656219394e-05, + "loss": 0.12678200721740723, + "mean_token_accuracy": 0.9698567974567414, + "num_tokens": 528691.0, + "step": 500 + }, + { + "entropy": 0.11618134506978095, + "epoch": 0.04629731696025045, + "grad_norm": 2.484375, + "learning_rate": 0.00010264446620959845, + "loss": 0.1609012222290039, + "mean_token_accuracy": 0.9640674701333046, + "num_tokens": 557262.0, + "step": 525 + }, + { + "entropy": 0.08170637517177966, + "epoch": 0.04850195110121475, + "grad_norm": 3.671875, + "learning_rate": 0.00010754162585700293, + "loss": 0.11303130149841309, + "mean_token_accuracy": 0.9746711283922196, + "num_tokens": 583864.0, + "step": 550 + }, + { + "entropy": 0.08663091063848696, + "epoch": 0.05070658524217906, + "grad_norm": 1.7578125, + "learning_rate": 0.00011243878550440745, + "loss": 0.11758679389953614, + "mean_token_accuracy": 0.9716430401802063, + "num_tokens": 609742.0, + "step": 575 + }, + { + "entropy": 0.09322622302162926, + "epoch": 0.05291121938314337, + "grad_norm": 3.5, + "learning_rate": 0.00011733594515181196, + "loss": 0.14786581039428712, + "mean_token_accuracy": 0.9670667290687561, + "num_tokens": 635773.0, + "step": 600 + }, + { + "entropy": 0.12020757110440172, + "epoch": 0.05511585352410767, + "grad_norm": 2.46875, + "learning_rate": 0.00012223310479921644, + "loss": 0.1722104072570801, + "mean_token_accuracy": 0.963900217115879, + "num_tokens": 662038.0, + "step": 625 + }, + { + "entropy": 0.11677032057195902, + "epoch": 0.05732048766507198, + "grad_norm": 15.625, + "learning_rate": 0.00012713026444662097, + "loss": 0.14971059799194336, + "mean_token_accuracy": 0.9657747402787209, + "num_tokens": 687244.0, + "step": 650 + }, + { + "entropy": 0.11523487624712288, + "epoch": 0.05952512180603629, + "grad_norm": 1.9375, + "learning_rate": 0.00013202742409402547, + "loss": 0.1585102081298828, + "mean_token_accuracy": 0.9673979789018631, + "num_tokens": 714382.0, + "step": 675 + }, + { + "entropy": 0.11896388848195784, + "epoch": 0.06172975594700059, + "grad_norm": 3.5, + "learning_rate": 0.00013692458374142997, + "loss": 0.1676181411743164, + "mean_token_accuracy": 0.9640780803561211, + "num_tokens": 740592.0, + "step": 700 + }, + { + "entropy": 0.09957129561458715, + "epoch": 0.0639343900879649, + "grad_norm": 4.15625, + "learning_rate": 0.0001418217433888345, + "loss": 0.14969416618347167, + "mean_token_accuracy": 0.9703717887401581, + "num_tokens": 766145.0, + "step": 725 + }, + { + "entropy": 0.11554073191975477, + "epoch": 0.06613902422892921, + "grad_norm": 3.453125, + "learning_rate": 0.000146718903036239, + "loss": 0.15943048477172853, + "mean_token_accuracy": 0.9683528250455856, + "num_tokens": 792413.0, + "step": 750 + }, + { + "entropy": 0.11423591443977785, + "epoch": 0.06834365836989352, + "grad_norm": 9.25, + "learning_rate": 0.0001516160626836435, + "loss": 0.16937288284301757, + "mean_token_accuracy": 0.962159241437912, + "num_tokens": 818187.0, + "step": 775 + }, + { + "entropy": 0.15337594370008445, + "epoch": 0.07054829251085783, + "grad_norm": 2.53125, + "learning_rate": 0.000156513222331048, + "loss": 0.19895185470581056, + "mean_token_accuracy": 0.9605051165819168, + "num_tokens": 845825.0, + "step": 800 + }, + { + "entropy": 0.10168401733972132, + "epoch": 0.07275292665182213, + "grad_norm": 4.34375, + "learning_rate": 0.00016141038197845252, + "loss": 0.14234644889831544, + "mean_token_accuracy": 0.9697119271755219, + "num_tokens": 870399.0, + "step": 825 + }, + { + "entropy": 0.13999531235196627, + "epoch": 0.07495756079278644, + "grad_norm": 3.109375, + "learning_rate": 0.00016630754162585702, + "loss": 0.19501604080200197, + "mean_token_accuracy": 0.959660935997963, + "num_tokens": 897210.0, + "step": 850 + }, + { + "entropy": 0.1450615050335182, + "epoch": 0.07716219493375075, + "grad_norm": 7.125, + "learning_rate": 0.00017120470127326152, + "loss": 0.19098827362060547, + "mean_token_accuracy": 0.9595730248093605, + "num_tokens": 923017.0, + "step": 875 + }, + { + "entropy": 0.1744435393344611, + "epoch": 0.07936682907471505, + "grad_norm": 4.6875, + "learning_rate": 0.00017610186092066602, + "loss": 0.22231653213500976, + "mean_token_accuracy": 0.947464411854744, + "num_tokens": 950476.0, + "step": 900 + }, + { + "entropy": 0.13871764447540044, + "epoch": 0.08157146321567936, + "grad_norm": 2.75, + "learning_rate": 0.00018099902056807051, + "loss": 0.2051361083984375, + "mean_token_accuracy": 0.9618028599023819, + "num_tokens": 976674.0, + "step": 925 + }, + { + "entropy": 0.11772402749746107, + "epoch": 0.08377609735664367, + "grad_norm": 2.578125, + "learning_rate": 0.00018589618021547504, + "loss": 0.1542353343963623, + "mean_token_accuracy": 0.9612125977873802, + "num_tokens": 1002875.0, + "step": 950 + }, + { + "entropy": 0.14306295619462617, + "epoch": 0.08598073149760797, + "grad_norm": 4.75, + "learning_rate": 0.00019079333986287954, + "loss": 0.2011384391784668, + "mean_token_accuracy": 0.9559961825609207, + "num_tokens": 1028393.0, + "step": 975 + }, + { + "entropy": 0.1746880966401659, + "epoch": 0.08818536563857228, + "grad_norm": 4.71875, + "learning_rate": 0.00019569049951028404, + "loss": 0.24061763763427735, + "mean_token_accuracy": 0.9488253518939018, + "num_tokens": 1053692.0, + "step": 1000 + }, + { + "epoch": 0.08818536563857228, + "eval_entropy": 0.09399272183322045, + "eval_loss": 0.09840720146894455, + "eval_mean_token_accuracy": 0.9736589030234762, + "eval_num_tokens": 1053692.0, + "eval_runtime": 245.793, + "eval_samples_per_second": 15.985, + "eval_steps_per_second": 3.999, + "step": 1000 + }, + { + "entropy": 0.13214532324811443, + "epoch": 0.09038999977953659, + "grad_norm": 4.75, + "learning_rate": 0.00019999999592140393, + "loss": 0.17850215911865233, + "mean_token_accuracy": 0.9571027851104736, + "num_tokens": 1079429.0, + "step": 1025 + }, + { + "entropy": 0.16548033393686637, + "epoch": 0.0925946339205009, + "grad_norm": 4.65625, + "learning_rate": 0.0001999996447091715, + "loss": 0.22275726318359376, + "mean_token_accuracy": 0.9530195724964142, + "num_tokens": 1106553.0, + "step": 1050 + }, + { + "entropy": 0.14543476202059538, + "epoch": 0.0947992680614652, + "grad_norm": 5.21875, + "learning_rate": 0.00019999872702753714, + "loss": 0.180413875579834, + "mean_token_accuracy": 0.9578671759366989, + "num_tokens": 1132889.0, + "step": 1075 + }, + { + "entropy": 0.13837194723077118, + "epoch": 0.0970039022024295, + "grad_norm": 10.0625, + "learning_rate": 0.00019999724288169928, + "loss": 0.1911659622192383, + "mean_token_accuracy": 0.9591153275966644, + "num_tokens": 1159859.0, + "step": 1100 + }, + { + "entropy": 0.15544078743841966, + "epoch": 0.09920853634339381, + "grad_norm": 2.578125, + "learning_rate": 0.00019999519228006515, + "loss": 0.2121676254272461, + "mean_token_accuracy": 0.9535513919591904, + "num_tokens": 1187029.0, + "step": 1125 + }, + { + "entropy": 0.15761214319965802, + "epoch": 0.10141317048435812, + "grad_norm": 3.609375, + "learning_rate": 0.00019999257523425088, + "loss": 0.23544374465942383, + "mean_token_accuracy": 0.9549354958534241, + "num_tokens": 1212489.0, + "step": 1150 + }, + { + "entropy": 0.14129603882553055, + "epoch": 0.10361780462532243, + "grad_norm": 7.53125, + "learning_rate": 0.00019998939175908126, + "loss": 0.18428075790405274, + "mean_token_accuracy": 0.9605149912834168, + "num_tokens": 1239902.0, + "step": 1175 + }, + { + "entropy": 0.19539385408628732, + "epoch": 0.10582243876628673, + "grad_norm": 7.21875, + "learning_rate": 0.00019998564187258974, + "loss": 0.30669073104858396, + "mean_token_accuracy": 0.9401221024990082, + "num_tokens": 1267180.0, + "step": 1200 + }, + { + "entropy": 0.17899971085134894, + "epoch": 0.10802707290725104, + "grad_norm": 90.0, + "learning_rate": 0.0001999813255960184, + "loss": 0.2703672981262207, + "mean_token_accuracy": 0.9464505657553672, + "num_tokens": 1295006.0, + "step": 1225 + }, + { + "entropy": 0.15225840135361068, + "epoch": 0.11023170704821535, + "grad_norm": 5.09375, + "learning_rate": 0.00019997644295381765, + "loss": 0.18838932037353515, + "mean_token_accuracy": 0.9539206418395042, + "num_tokens": 1321227.0, + "step": 1250 + }, + { + "entropy": 0.16457684379653073, + "epoch": 0.11243634118917965, + "grad_norm": 17.875, + "learning_rate": 0.0001999709939736463, + "loss": 0.22236486434936523, + "mean_token_accuracy": 0.9525096288323402, + "num_tokens": 1348546.0, + "step": 1275 + }, + { + "entropy": 0.1539011792768724, + "epoch": 0.11464097533014396, + "grad_norm": 3.828125, + "learning_rate": 0.00019996497868637132, + "loss": 0.20238460540771486, + "mean_token_accuracy": 0.9542306599020958, + "num_tokens": 1374669.0, + "step": 1300 + }, + { + "entropy": 0.20363085102522746, + "epoch": 0.11684560947110827, + "grad_norm": 4.65625, + "learning_rate": 0.0001999583971260675, + "loss": 0.28179990768432617, + "mean_token_accuracy": 0.941902932524681, + "num_tokens": 1400529.0, + "step": 1325 + }, + { + "entropy": 0.1503680677805096, + "epoch": 0.11905024361207257, + "grad_norm": 15.3125, + "learning_rate": 0.00019995124933001754, + "loss": 0.22546863555908203, + "mean_token_accuracy": 0.9548282194137573, + "num_tokens": 1428330.0, + "step": 1350 + }, + { + "entropy": 0.13919373288343195, + "epoch": 0.12125487775303688, + "grad_norm": 5.15625, + "learning_rate": 0.00019994353533871165, + "loss": 0.1785750961303711, + "mean_token_accuracy": 0.9592090272903442, + "num_tokens": 1454023.0, + "step": 1375 + }, + { + "entropy": 0.1594487912580371, + "epoch": 0.12345951189400119, + "grad_norm": 3.1875, + "learning_rate": 0.0001999352551958474, + "loss": 0.22408544540405273, + "mean_token_accuracy": 0.9546812137961388, + "num_tokens": 1480660.0, + "step": 1400 + }, + { + "entropy": 0.12862279892200604, + "epoch": 0.1256641460349655, + "grad_norm": 4.65625, + "learning_rate": 0.00019992640894832945, + "loss": 0.1851462936401367, + "mean_token_accuracy": 0.9622556710243225, + "num_tokens": 1507742.0, + "step": 1425 + }, + { + "entropy": 0.12973011572903487, + "epoch": 0.1278687801759298, + "grad_norm": 13.4375, + "learning_rate": 0.00019991699664626924, + "loss": 0.182869815826416, + "mean_token_accuracy": 0.9545741581916809, + "num_tokens": 1534303.0, + "step": 1450 + }, + { + "entropy": 0.1497259267186746, + "epoch": 0.13007341431689412, + "grad_norm": 8.8125, + "learning_rate": 0.00019990701834298475, + "loss": 0.22879138946533203, + "mean_token_accuracy": 0.9533333089947701, + "num_tokens": 1560598.0, + "step": 1475 + }, + { + "entropy": 0.18277295206673444, + "epoch": 0.13227804845785843, + "grad_norm": 7.90625, + "learning_rate": 0.00019989647409500024, + "loss": 0.21807186126708986, + "mean_token_accuracy": 0.9539013829827309, + "num_tokens": 1587377.0, + "step": 1500 + }, + { + "entropy": 0.1608945361687802, + "epoch": 0.13448268259882273, + "grad_norm": 6.90625, + "learning_rate": 0.00019988536396204585, + "loss": 0.2067807388305664, + "mean_token_accuracy": 0.9522431689500809, + "num_tokens": 1614095.0, + "step": 1525 + }, + { + "entropy": 0.16176950700290035, + "epoch": 0.13668731673978704, + "grad_norm": 8.6875, + "learning_rate": 0.00019987368800705732, + "loss": 0.21591096878051758, + "mean_token_accuracy": 0.9503604331612587, + "num_tokens": 1639568.0, + "step": 1550 + }, + { + "entropy": 0.19014478008728475, + "epoch": 0.13889195088075135, + "grad_norm": 20.625, + "learning_rate": 0.0001998614462961756, + "loss": 0.2421278953552246, + "mean_token_accuracy": 0.9487079519033432, + "num_tokens": 1664964.0, + "step": 1575 + }, + { + "entropy": 0.17077196488622576, + "epoch": 0.14109658502171565, + "grad_norm": 7.84375, + "learning_rate": 0.00019984863889874646, + "loss": 0.2088161277770996, + "mean_token_accuracy": 0.9499969807267189, + "num_tokens": 1691979.0, + "step": 1600 + }, + { + "entropy": 0.1300421412358992, + "epoch": 0.14330121916267996, + "grad_norm": 10.875, + "learning_rate": 0.00019983526588732015, + "loss": 0.18558111190795898, + "mean_token_accuracy": 0.9627196085453034, + "num_tokens": 1717606.0, + "step": 1625 + }, + { + "entropy": 0.15682672704104333, + "epoch": 0.14550585330364427, + "grad_norm": 13.375, + "learning_rate": 0.000199821327337651, + "loss": 0.199498291015625, + "mean_token_accuracy": 0.9547498086094857, + "num_tokens": 1743479.0, + "step": 1650 + }, + { + "entropy": 0.13432513456908055, + "epoch": 0.14771048744460857, + "grad_norm": 6.65625, + "learning_rate": 0.00019980682332869685, + "loss": 0.17353324890136718, + "mean_token_accuracy": 0.959072844684124, + "num_tokens": 1769361.0, + "step": 1675 + }, + { + "entropy": 0.16332185944309457, + "epoch": 0.14991512158557288, + "grad_norm": 8.75, + "learning_rate": 0.0001997917539426188, + "loss": 0.2064606285095215, + "mean_token_accuracy": 0.9575307658314705, + "num_tokens": 1796448.0, + "step": 1700 + }, + { + "entropy": 0.1739040635782294, + "epoch": 0.1521197557265372, + "grad_norm": 5.90625, + "learning_rate": 0.00019977611926478062, + "loss": 0.21098520278930663, + "mean_token_accuracy": 0.954549497961998, + "num_tokens": 1822567.0, + "step": 1725 + }, + { + "entropy": 0.159052537526004, + "epoch": 0.1543243898675015, + "grad_norm": 21.125, + "learning_rate": 0.00019975991938374826, + "loss": 0.2069353485107422, + "mean_token_accuracy": 0.9509221604466438, + "num_tokens": 1850030.0, + "step": 1750 + }, + { + "entropy": 0.15716556440107524, + "epoch": 0.1565290240084658, + "grad_norm": 8.375, + "learning_rate": 0.00019974315439128944, + "loss": 0.19329355239868165, + "mean_token_accuracy": 0.9546804228425025, + "num_tokens": 1876869.0, + "step": 1775 + }, + { + "entropy": 0.1525935530057177, + "epoch": 0.1587336581494301, + "grad_norm": 6.09375, + "learning_rate": 0.000199725824382373, + "loss": 0.20368234634399415, + "mean_token_accuracy": 0.9573848494887351, + "num_tokens": 1902705.0, + "step": 1800 + }, + { + "entropy": 0.14915346608497201, + "epoch": 0.1609382922903944, + "grad_norm": 12.3125, + "learning_rate": 0.0001997079294551686, + "loss": 0.20571292877197267, + "mean_token_accuracy": 0.9549185973405838, + "num_tokens": 1929332.0, + "step": 1825 + }, + { + "entropy": 0.1540890626120381, + "epoch": 0.16314292643135872, + "grad_norm": 7.96875, + "learning_rate": 0.00019968946971104577, + "loss": 0.2072519302368164, + "mean_token_accuracy": 0.9564824342727661, + "num_tokens": 1956884.0, + "step": 1850 + }, + { + "entropy": 0.14470923049142584, + "epoch": 0.16534756057232303, + "grad_norm": 7.125, + "learning_rate": 0.00019967044525457373, + "loss": 0.17775110244750977, + "mean_token_accuracy": 0.9551727205514908, + "num_tokens": 1982453.0, + "step": 1875 + }, + { + "entropy": 0.15138954945839941, + "epoch": 0.16755219471328733, + "grad_norm": 9.375, + "learning_rate": 0.0001996508561935206, + "loss": 0.18922266006469726, + "mean_token_accuracy": 0.9547863042354584, + "num_tokens": 2009831.0, + "step": 1900 + }, + { + "entropy": 0.14155943776480853, + "epoch": 0.16975682885425164, + "grad_norm": 5.25, + "learning_rate": 0.0001996307026388528, + "loss": 0.20168161392211914, + "mean_token_accuracy": 0.9573139929771424, + "num_tokens": 2035463.0, + "step": 1925 + }, + { + "entropy": 0.13731083187274634, + "epoch": 0.17196146299521595, + "grad_norm": 5.59375, + "learning_rate": 0.00019960998470473445, + "loss": 0.20148942947387696, + "mean_token_accuracy": 0.9581686609983444, + "num_tokens": 2061128.0, + "step": 1950 + }, + { + "entropy": 0.18538200524868442, + "epoch": 0.17416609713618025, + "grad_norm": 6.1875, + "learning_rate": 0.0001995887025085267, + "loss": 0.2421389961242676, + "mean_token_accuracy": 0.9522135049104691, + "num_tokens": 2088131.0, + "step": 1975 + }, + { + "entropy": 0.1542595046805218, + "epoch": 0.17637073127714456, + "grad_norm": 13.4375, + "learning_rate": 0.00019956685617078716, + "loss": 0.17982059478759765, + "mean_token_accuracy": 0.9567183205485343, + "num_tokens": 2113958.0, + "step": 2000 + }, + { + "epoch": 0.17637073127714456, + "eval_entropy": 0.12889170395125116, + "eval_loss": 0.14472714066505432, + "eval_mean_token_accuracy": 0.9669632775252373, + "eval_num_tokens": 2113958.0, + "eval_runtime": 230.2268, + "eval_samples_per_second": 17.066, + "eval_steps_per_second": 4.27, + "step": 2000 + }, + { + "entropy": 0.17109735576668755, + "epoch": 0.17857536541810887, + "grad_norm": 15.625, + "learning_rate": 0.00019954444581526907, + "loss": 0.18888046264648437, + "mean_token_accuracy": 0.960274083018303, + "num_tokens": 2139315.0, + "step": 2025 + }, + { + "entropy": 0.2150292192818597, + "epoch": 0.18077999955907317, + "grad_norm": 3.125, + "learning_rate": 0.0001995214715689207, + "loss": 0.24926740646362305, + "mean_token_accuracy": 0.9541423147916794, + "num_tokens": 2165555.0, + "step": 2050 + }, + { + "entropy": 0.14501372674596497, + "epoch": 0.18298463370003748, + "grad_norm": 4.90625, + "learning_rate": 0.00019949793356188454, + "loss": 0.20213737487792968, + "mean_token_accuracy": 0.9615710428357125, + "num_tokens": 2191462.0, + "step": 2075 + }, + { + "entropy": 0.1310080810985528, + "epoch": 0.1851892678410018, + "grad_norm": 4.8125, + "learning_rate": 0.00019947383192749668, + "loss": 0.18873476028442382, + "mean_token_accuracy": 0.9583966609835625, + "num_tokens": 2216189.0, + "step": 2100 + }, + { + "entropy": 0.1508674483350478, + "epoch": 0.1873939019819661, + "grad_norm": 13.1875, + "learning_rate": 0.00019944916680228608, + "loss": 0.18426620483398437, + "mean_token_accuracy": 0.956863840520382, + "num_tokens": 2241622.0, + "step": 2125 + }, + { + "entropy": 0.19120873935054988, + "epoch": 0.1895985361229304, + "grad_norm": 9.125, + "learning_rate": 0.0001994239383259735, + "loss": 0.2455206298828125, + "mean_token_accuracy": 0.9497691139578819, + "num_tokens": 2268247.0, + "step": 2150 + }, + { + "entropy": 0.13146985903033057, + "epoch": 0.1918031702638947, + "grad_norm": 16.375, + "learning_rate": 0.00019939814664147112, + "loss": 0.19511665344238283, + "mean_token_accuracy": 0.9607363530993461, + "num_tokens": 2293502.0, + "step": 2175 + }, + { + "entropy": 0.1565953103499487, + "epoch": 0.194007804404859, + "grad_norm": 9.8125, + "learning_rate": 0.00019937179189488146, + "loss": 0.21843402862548827, + "mean_token_accuracy": 0.9554175850749016, + "num_tokens": 2318791.0, + "step": 2200 + }, + { + "entropy": 0.19252748679718934, + "epoch": 0.19621243854582332, + "grad_norm": 6.40625, + "learning_rate": 0.00019934487423549656, + "loss": 0.2356930160522461, + "mean_token_accuracy": 0.9517645919322968, + "num_tokens": 2345980.0, + "step": 2225 + }, + { + "entropy": 0.14177053164923564, + "epoch": 0.19841707268678763, + "grad_norm": 9.5, + "learning_rate": 0.00019931739381579737, + "loss": 0.20377872467041017, + "mean_token_accuracy": 0.9597736677527428, + "num_tokens": 2371597.0, + "step": 2250 + }, + { + "entropy": 0.1750909099751152, + "epoch": 0.20062170682775193, + "grad_norm": 2.84375, + "learning_rate": 0.00019928935079145254, + "loss": 0.19352615356445313, + "mean_token_accuracy": 0.9594111356139183, + "num_tokens": 2397749.0, + "step": 2275 + }, + { + "entropy": 0.1871402624528855, + "epoch": 0.20282634096871624, + "grad_norm": 9.4375, + "learning_rate": 0.00019926074532131778, + "loss": 0.23113348007202147, + "mean_token_accuracy": 0.9543161234259605, + "num_tokens": 2423692.0, + "step": 2300 + }, + { + "entropy": 0.15049185859272257, + "epoch": 0.20503097510968055, + "grad_norm": 4.5, + "learning_rate": 0.00019923157756743492, + "loss": 0.18978424072265626, + "mean_token_accuracy": 0.9580981060862541, + "num_tokens": 2449112.0, + "step": 2325 + }, + { + "entropy": 0.16552068044547924, + "epoch": 0.20723560925064485, + "grad_norm": 4.8125, + "learning_rate": 0.00019920184769503096, + "loss": 0.23359138488769532, + "mean_token_accuracy": 0.9517715626955032, + "num_tokens": 2475135.0, + "step": 2350 + }, + { + "entropy": 0.17032419282593764, + "epoch": 0.20944024339160916, + "grad_norm": 13.5625, + "learning_rate": 0.00019917155587251712, + "loss": 0.20869092941284179, + "mean_token_accuracy": 0.9506035950779915, + "num_tokens": 2502512.0, + "step": 2375 + }, + { + "entropy": 0.15413983556907623, + "epoch": 0.21164487753257347, + "grad_norm": 8.6875, + "learning_rate": 0.00019914070227148795, + "loss": 0.20199440002441407, + "mean_token_accuracy": 0.9544082489609719, + "num_tokens": 2528612.0, + "step": 2400 + }, + { + "entropy": 0.20244524533161892, + "epoch": 0.21384951167353777, + "grad_norm": 7.34375, + "learning_rate": 0.00019910928706672022, + "loss": 0.26269786834716796, + "mean_token_accuracy": 0.9423273745179176, + "num_tokens": 2555625.0, + "step": 2425 + }, + { + "entropy": 0.16426463149138726, + "epoch": 0.21605414581450208, + "grad_norm": 3.90625, + "learning_rate": 0.0001990773104361721, + "loss": 0.2108094596862793, + "mean_token_accuracy": 0.953407633304596, + "num_tokens": 2581347.0, + "step": 2450 + }, + { + "entropy": 0.18900560716865583, + "epoch": 0.2182587799554664, + "grad_norm": 5.96875, + "learning_rate": 0.0001990447725609821, + "loss": 0.23020565032958984, + "mean_token_accuracy": 0.9502732402086258, + "num_tokens": 2607478.0, + "step": 2475 + }, + { + "entropy": 0.14198036040528678, + "epoch": 0.2204634140964307, + "grad_norm": 6.75, + "learning_rate": 0.0001990116736254679, + "loss": 0.17339960098266602, + "mean_token_accuracy": 0.9606603449583053, + "num_tokens": 2634683.0, + "step": 2500 + }, + { + "entropy": 0.16489507200894876, + "epoch": 0.222668048237395, + "grad_norm": 8.75, + "learning_rate": 0.00019897801381712563, + "loss": 0.22926589965820313, + "mean_token_accuracy": 0.9573606929183006, + "num_tokens": 2662560.0, + "step": 2525 + }, + { + "entropy": 0.14484784842003137, + "epoch": 0.2248726823783593, + "grad_norm": 8.125, + "learning_rate": 0.00019894379332662836, + "loss": 0.20587156295776368, + "mean_token_accuracy": 0.9606982052326203, + "num_tokens": 2689760.0, + "step": 2550 + }, + { + "entropy": 0.1534224282670766, + "epoch": 0.2270773165193236, + "grad_norm": 6.75, + "learning_rate": 0.0001989090123478255, + "loss": 0.21122034072875975, + "mean_token_accuracy": 0.9542035666108132, + "num_tokens": 2715288.0, + "step": 2575 + }, + { + "entropy": 0.16028283250052483, + "epoch": 0.22928195066028792, + "grad_norm": 4.625, + "learning_rate": 0.00019887367107774125, + "loss": 0.20950069427490234, + "mean_token_accuracy": 0.9586973160505294, + "num_tokens": 2740557.0, + "step": 2600 + }, + { + "entropy": 0.18298680773936213, + "epoch": 0.23148658480125223, + "grad_norm": 8.3125, + "learning_rate": 0.00019883776971657384, + "loss": 0.2443247604370117, + "mean_token_accuracy": 0.9503592163324356, + "num_tokens": 2767449.0, + "step": 2625 + }, + { + "entropy": 0.1627288061589934, + "epoch": 0.23369121894221653, + "grad_norm": 12.375, + "learning_rate": 0.00019880130846769425, + "loss": 0.20207910537719725, + "mean_token_accuracy": 0.953170590698719, + "num_tokens": 2792767.0, + "step": 2650 + }, + { + "entropy": 0.15129281114554033, + "epoch": 0.23589585308318084, + "grad_norm": 10.8125, + "learning_rate": 0.000198764287537645, + "loss": 0.19894697189331054, + "mean_token_accuracy": 0.9556801280379296, + "num_tokens": 2819173.0, + "step": 2675 + }, + { + "entropy": 0.13785832320805638, + "epoch": 0.23810048722414515, + "grad_norm": 8.3125, + "learning_rate": 0.00019872670713613907, + "loss": 0.16127649307250977, + "mean_token_accuracy": 0.9632558959722519, + "num_tokens": 2844158.0, + "step": 2700 + }, + { + "entropy": 0.17196971918223425, + "epoch": 0.24030512136510945, + "grad_norm": 13.875, + "learning_rate": 0.00019868856747605872, + "loss": 0.22303541183471678, + "mean_token_accuracy": 0.9536935070157051, + "num_tokens": 2870212.0, + "step": 2725 + }, + { + "entropy": 0.14516899693524465, + "epoch": 0.24250975550607376, + "grad_norm": 10.5, + "learning_rate": 0.0001986498687734542, + "loss": 0.18350645065307616, + "mean_token_accuracy": 0.9615237146615982, + "num_tokens": 2896169.0, + "step": 2750 + }, + { + "entropy": 0.19394321402534842, + "epoch": 0.24471438964703807, + "grad_norm": 4.71875, + "learning_rate": 0.00019861061124754262, + "loss": 0.23362028121948242, + "mean_token_accuracy": 0.9451329198479652, + "num_tokens": 2924429.0, + "step": 2775 + }, + { + "entropy": 0.1593242084258236, + "epoch": 0.24691902378800237, + "grad_norm": 36.0, + "learning_rate": 0.00019857079512070663, + "loss": 0.2111642074584961, + "mean_token_accuracy": 0.9549560195207596, + "num_tokens": 2949725.0, + "step": 2800 + }, + { + "entropy": 0.14827293824637308, + "epoch": 0.24912365792896668, + "grad_norm": 8.75, + "learning_rate": 0.00019853042061849317, + "loss": 0.18694892883300782, + "mean_token_accuracy": 0.9584823083877564, + "num_tokens": 2975320.0, + "step": 2825 + }, + { + "entropy": 0.14805610009469092, + "epoch": 0.251328292069931, + "grad_norm": 4.8125, + "learning_rate": 0.00019848948796961233, + "loss": 0.1870688819885254, + "mean_token_accuracy": 0.9585736668109894, + "num_tokens": 3000789.0, + "step": 2850 + }, + { + "entropy": 0.1488057920546271, + "epoch": 0.2535329262108953, + "grad_norm": 2.78125, + "learning_rate": 0.00019844799740593582, + "loss": 0.2043631935119629, + "mean_token_accuracy": 0.9576079681515693, + "num_tokens": 3027263.0, + "step": 2875 + }, + { + "entropy": 0.17447588493814692, + "epoch": 0.2557375603518596, + "grad_norm": 5.09375, + "learning_rate": 0.0001984059491624958, + "loss": 0.22025297164916993, + "mean_token_accuracy": 0.9506224057078362, + "num_tokens": 3053711.0, + "step": 2900 + }, + { + "entropy": 0.1623274303250946, + "epoch": 0.2579421944928239, + "grad_norm": 3.96875, + "learning_rate": 0.00019836334347748358, + "loss": 0.19379936218261717, + "mean_token_accuracy": 0.9566253125667572, + "num_tokens": 3080007.0, + "step": 2925 + }, + { + "entropy": 0.16122146823909134, + "epoch": 0.26014682863378824, + "grad_norm": 9.375, + "learning_rate": 0.0001983201805922482, + "loss": 0.22965421676635742, + "mean_token_accuracy": 0.9526374578475952, + "num_tokens": 3107320.0, + "step": 2950 + }, + { + "entropy": 0.20952579901786522, + "epoch": 0.2623514627747525, + "grad_norm": 10.875, + "learning_rate": 0.00019827646075129502, + "loss": 0.26936729431152345, + "mean_token_accuracy": 0.9549751174449921, + "num_tokens": 3133692.0, + "step": 2975 + }, + { + "entropy": 0.145962798174005, + "epoch": 0.26455609691571685, + "grad_norm": 9.9375, + "learning_rate": 0.0001982321842022845, + "loss": 0.19020818710327148, + "mean_token_accuracy": 0.9561607944965362, + "num_tokens": 3160508.0, + "step": 3000 + }, + { + "epoch": 0.26455609691571685, + "eval_entropy": 0.11402115050896562, + "eval_loss": 0.09617123752832413, + "eval_mean_token_accuracy": 0.9735903109320546, + "eval_num_tokens": 3160508.0, + "eval_runtime": 229.2109, + "eval_samples_per_second": 17.141, + "eval_steps_per_second": 4.289, + "step": 3000 + }, + { + "entropy": 0.1764632138889283, + "epoch": 0.26676073105668113, + "grad_norm": 5.15625, + "learning_rate": 0.0001981873511960306, + "loss": 0.22552785873413086, + "mean_token_accuracy": 0.9531935697793961, + "num_tokens": 3188120.0, + "step": 3025 + }, + { + "entropy": 0.14757733151782304, + "epoch": 0.26896536519764547, + "grad_norm": 7.96875, + "learning_rate": 0.00019814196198649948, + "loss": 0.22005025863647462, + "mean_token_accuracy": 0.9574960014224052, + "num_tokens": 3214224.0, + "step": 3050 + }, + { + "entropy": 0.13798468225868418, + "epoch": 0.27116999933860975, + "grad_norm": 5.5625, + "learning_rate": 0.00019809601683080805, + "loss": 0.18730186462402343, + "mean_token_accuracy": 0.9637488493323326, + "num_tokens": 3240698.0, + "step": 3075 + }, + { + "entropy": 0.1589315331657417, + "epoch": 0.2733746334795741, + "grad_norm": 6.9375, + "learning_rate": 0.0001980495159892225, + "loss": 0.19896188735961914, + "mean_token_accuracy": 0.9575601083040237, + "num_tokens": 3265159.0, + "step": 3100 + }, + { + "entropy": 0.17124603053322063, + "epoch": 0.27557926762053836, + "grad_norm": 3.84375, + "learning_rate": 0.00019800245972515675, + "loss": 0.22188325881958007, + "mean_token_accuracy": 0.9544751027226448, + "num_tokens": 3293130.0, + "step": 3125 + }, + { + "entropy": 0.17563861726317553, + "epoch": 0.2777839017615027, + "grad_norm": 11.3125, + "learning_rate": 0.0001979548483051711, + "loss": 0.20833553314208986, + "mean_token_accuracy": 0.9533888497948646, + "num_tokens": 3319208.0, + "step": 3150 + }, + { + "entropy": 0.17662670996272936, + "epoch": 0.279988535902467, + "grad_norm": 7.125, + "learning_rate": 0.00019790668199897072, + "loss": 0.2158352851867676, + "mean_token_accuracy": 0.9493980967998504, + "num_tokens": 3345268.0, + "step": 3175 + }, + { + "entropy": 0.20901564525905997, + "epoch": 0.2821931700434313, + "grad_norm": 6.3125, + "learning_rate": 0.00019785796107940385, + "loss": 0.2893720817565918, + "mean_token_accuracy": 0.9452108177542686, + "num_tokens": 3373762.0, + "step": 3200 + }, + { + "entropy": 0.15227315539959818, + "epoch": 0.2843978041843956, + "grad_norm": 2.84375, + "learning_rate": 0.00019780868582246064, + "loss": 0.1822994613647461, + "mean_token_accuracy": 0.9600504431128501, + "num_tokens": 3400602.0, + "step": 3225 + }, + { + "entropy": 0.15691042528487742, + "epoch": 0.2866024383253599, + "grad_norm": 7.375, + "learning_rate": 0.0001977588565072713, + "loss": 0.23442533493041992, + "mean_token_accuracy": 0.9516582262516021, + "num_tokens": 3427327.0, + "step": 3250 + }, + { + "entropy": 0.15344003664329647, + "epoch": 0.2888070724663242, + "grad_norm": 7.96875, + "learning_rate": 0.0001977084734161047, + "loss": 0.19131439208984374, + "mean_token_accuracy": 0.9592954310774803, + "num_tokens": 3453597.0, + "step": 3275 + }, + { + "entropy": 0.17065959631931038, + "epoch": 0.29101170660728853, + "grad_norm": 6.5, + "learning_rate": 0.00019765753683436663, + "loss": 0.23813190460205078, + "mean_token_accuracy": 0.9509798160195351, + "num_tokens": 3480758.0, + "step": 3300 + }, + { + "entropy": 0.16926921415608376, + "epoch": 0.2932163407482528, + "grad_norm": 6.21875, + "learning_rate": 0.00019760604705059822, + "loss": 0.2048003387451172, + "mean_token_accuracy": 0.9565244841575623, + "num_tokens": 3506743.0, + "step": 3325 + }, + { + "entropy": 0.1781733432924375, + "epoch": 0.29542097488921715, + "grad_norm": 3.140625, + "learning_rate": 0.00019755400435647445, + "loss": 0.24166810989379883, + "mean_token_accuracy": 0.9541838404536247, + "num_tokens": 3533219.0, + "step": 3350 + }, + { + "entropy": 0.15910907412180678, + "epoch": 0.2976256090301814, + "grad_norm": 4.34375, + "learning_rate": 0.00019750140904680223, + "loss": 0.2406574821472168, + "mean_token_accuracy": 0.952444885969162, + "num_tokens": 3559206.0, + "step": 3375 + }, + { + "entropy": 0.17944070099154488, + "epoch": 0.29983024317114576, + "grad_norm": 4.78125, + "learning_rate": 0.00019744826141951903, + "loss": 0.24596309661865234, + "mean_token_accuracy": 0.9527160394191742, + "num_tokens": 3585456.0, + "step": 3400 + }, + { + "entropy": 0.14929035058245063, + "epoch": 0.30203487731211004, + "grad_norm": 4.40625, + "learning_rate": 0.00019739456177569092, + "loss": 0.20019350051879883, + "mean_token_accuracy": 0.9583090448379517, + "num_tokens": 3611511.0, + "step": 3425 + }, + { + "entropy": 0.15263430486433208, + "epoch": 0.3042395114530744, + "grad_norm": 4.53125, + "learning_rate": 0.000197340310419511, + "loss": 0.19227680206298828, + "mean_token_accuracy": 0.956867307126522, + "num_tokens": 3637287.0, + "step": 3450 + }, + { + "entropy": 0.15233371320180594, + "epoch": 0.30644414559403865, + "grad_norm": 15.125, + "learning_rate": 0.0001972855076582978, + "loss": 0.1797281265258789, + "mean_token_accuracy": 0.9600564774870872, + "num_tokens": 3662617.0, + "step": 3475 + }, + { + "entropy": 0.1706169345683884, + "epoch": 0.308648779735003, + "grad_norm": 5.28125, + "learning_rate": 0.0001972301538024932, + "loss": 0.22658414840698243, + "mean_token_accuracy": 0.9483333799242973, + "num_tokens": 3689357.0, + "step": 3500 + }, + { + "entropy": 0.19369855729979463, + "epoch": 0.31085341387596727, + "grad_norm": 13.625, + "learning_rate": 0.00019717424916566102, + "loss": 0.2644779968261719, + "mean_token_accuracy": 0.9458743578195572, + "num_tokens": 3718450.0, + "step": 3525 + }, + { + "entropy": 0.17568644701968877, + "epoch": 0.3130580480169316, + "grad_norm": 7.28125, + "learning_rate": 0.00019711779406448505, + "loss": 0.22591154098510743, + "mean_token_accuracy": 0.955237175822258, + "num_tokens": 3745358.0, + "step": 3550 + }, + { + "entropy": 0.15584104033769108, + "epoch": 0.3152626821578959, + "grad_norm": 7.40625, + "learning_rate": 0.00019706078881876724, + "loss": 0.1931936836242676, + "mean_token_accuracy": 0.9567597940564155, + "num_tokens": 3770330.0, + "step": 3575 + }, + { + "entropy": 0.1786574741289951, + "epoch": 0.3174673162988602, + "grad_norm": 6.59375, + "learning_rate": 0.00019700323375142608, + "loss": 0.22003387451171874, + "mean_token_accuracy": 0.9521878311038017, + "num_tokens": 3796581.0, + "step": 3600 + }, + { + "entropy": 0.1740282563189976, + "epoch": 0.3196719504398245, + "grad_norm": 6.84375, + "learning_rate": 0.00019694512918849453, + "loss": 0.20711589813232423, + "mean_token_accuracy": 0.9538468858599662, + "num_tokens": 3822606.0, + "step": 3625 + }, + { + "entropy": 0.14037064747535624, + "epoch": 0.3218765845807888, + "grad_norm": 7.96875, + "learning_rate": 0.00019688647545911832, + "loss": 0.1778187370300293, + "mean_token_accuracy": 0.9586274382472039, + "num_tokens": 3848390.0, + "step": 3650 + }, + { + "entropy": 0.15400171629153192, + "epoch": 0.3240812187217531, + "grad_norm": 7.6875, + "learning_rate": 0.00019682727289555417, + "loss": 0.21966312408447267, + "mean_token_accuracy": 0.9530961990356446, + "num_tokens": 3873771.0, + "step": 3675 + }, + { + "entropy": 0.192363264701562, + "epoch": 0.32628585286271744, + "grad_norm": 5.09375, + "learning_rate": 0.00019676752183316753, + "loss": 0.2318318748474121, + "mean_token_accuracy": 0.9510029405355453, + "num_tokens": 3898700.0, + "step": 3700 + }, + { + "entropy": 0.15507790248957462, + "epoch": 0.3284904870036817, + "grad_norm": 6.25, + "learning_rate": 0.00019670722261043119, + "loss": 0.20907793045043946, + "mean_token_accuracy": 0.9551364532113076, + "num_tokens": 3925248.0, + "step": 3725 + }, + { + "entropy": 0.18347839491441845, + "epoch": 0.33069512114464605, + "grad_norm": 4.96875, + "learning_rate": 0.0001966463755689229, + "loss": 0.22271282196044923, + "mean_token_accuracy": 0.9531350857019425, + "num_tokens": 3952118.0, + "step": 3750 + }, + { + "entropy": 0.1369088706956245, + "epoch": 0.33289975528561033, + "grad_norm": 6.28125, + "learning_rate": 0.00019658498105332392, + "loss": 0.16287109375, + "mean_token_accuracy": 0.9628218576312065, + "num_tokens": 3978522.0, + "step": 3775 + }, + { + "entropy": 0.15609612919040955, + "epoch": 0.33510438942657467, + "grad_norm": 7.34375, + "learning_rate": 0.0001965230394114165, + "loss": 0.19069808959960938, + "mean_token_accuracy": 0.9597328254580497, + "num_tokens": 4003867.0, + "step": 3800 + }, + { + "entropy": 0.1823128617950715, + "epoch": 0.33730902356753895, + "grad_norm": 2.59375, + "learning_rate": 0.0001964605509940824, + "loss": 0.24169843673706054, + "mean_token_accuracy": 0.9522805500030518, + "num_tokens": 4031117.0, + "step": 3825 + }, + { + "entropy": 0.15791643084958196, + "epoch": 0.3395136577085033, + "grad_norm": 7.96875, + "learning_rate": 0.00019639751615530059, + "loss": 0.19818052291870117, + "mean_token_accuracy": 0.9531289768218995, + "num_tokens": 4056962.0, + "step": 3850 + }, + { + "entropy": 0.16439166482770814, + "epoch": 0.34171829184946756, + "grad_norm": 4.65625, + "learning_rate": 0.00019633393525214548, + "loss": 0.21556419372558594, + "mean_token_accuracy": 0.955651236474514, + "num_tokens": 4083717.0, + "step": 3875 + }, + { + "entropy": 0.18368883373914285, + "epoch": 0.3439229259904319, + "grad_norm": 5.5625, + "learning_rate": 0.00019626980864478462, + "loss": 0.2704266929626465, + "mean_token_accuracy": 0.9481986343860627, + "num_tokens": 4111693.0, + "step": 3900 + }, + { + "entropy": 0.18578361921128816, + "epoch": 0.3461275601313962, + "grad_norm": 6.40625, + "learning_rate": 0.000196205136696477, + "loss": 0.23863527297973633, + "mean_token_accuracy": 0.951669595837593, + "num_tokens": 4138165.0, + "step": 3925 + }, + { + "entropy": 0.1771517199045047, + "epoch": 0.3483321942723605, + "grad_norm": 4.78125, + "learning_rate": 0.00019613991977357066, + "loss": 0.23310329437255858, + "mean_token_accuracy": 0.9477997007966041, + "num_tokens": 4166504.0, + "step": 3950 + }, + { + "entropy": 0.18780332374793943, + "epoch": 0.3505368284133248, + "grad_norm": 5.78125, + "learning_rate": 0.00019607415824550087, + "loss": 0.23442667007446288, + "mean_token_accuracy": 0.9532361942529678, + "num_tokens": 4191893.0, + "step": 3975 + }, + { + "entropy": 0.18824178458191454, + "epoch": 0.3527414625542891, + "grad_norm": 6.125, + "learning_rate": 0.0001960078524847879, + "loss": 0.2508979606628418, + "mean_token_accuracy": 0.9472871825098992, + "num_tokens": 4218282.0, + "step": 4000 + }, + { + "epoch": 0.3527414625542891, + "eval_entropy": 0.11648815209642886, + "eval_loss": 0.10695337504148483, + "eval_mean_token_accuracy": 0.9704467630580386, + "eval_num_tokens": 4218282.0, + "eval_runtime": 245.6179, + "eval_samples_per_second": 15.996, + "eval_steps_per_second": 4.002, + "step": 4000 + }, + { + "entropy": 0.1659863335173577, + "epoch": 0.3549460966952534, + "grad_norm": 5.375, + "learning_rate": 0.00019594100286703486, + "loss": 0.22078191757202148, + "mean_token_accuracy": 0.9524767264723778, + "num_tokens": 4243916.0, + "step": 4025 + }, + { + "entropy": 0.14453170219901948, + "epoch": 0.35715073083621773, + "grad_norm": 3.03125, + "learning_rate": 0.00019587360977092573, + "loss": 0.205191707611084, + "mean_token_accuracy": 0.9619744899868965, + "num_tokens": 4269856.0, + "step": 4050 + }, + { + "entropy": 0.1569202733691782, + "epoch": 0.359355364977182, + "grad_norm": 4.40625, + "learning_rate": 0.00019580567357822321, + "loss": 0.18136905670166015, + "mean_token_accuracy": 0.9600639209151268, + "num_tokens": 4295878.0, + "step": 4075 + }, + { + "entropy": 0.18661301417043433, + "epoch": 0.36155999911814635, + "grad_norm": 7.5, + "learning_rate": 0.00019573719467376636, + "loss": 0.27021623611450196, + "mean_token_accuracy": 0.949309915304184, + "num_tokens": 4323336.0, + "step": 4100 + }, + { + "entropy": 0.170799303437816, + "epoch": 0.3637646332591106, + "grad_norm": 2.8125, + "learning_rate": 0.00019566817344546862, + "loss": 0.2194487190246582, + "mean_token_accuracy": 0.9553793686628341, + "num_tokens": 4348815.0, + "step": 4125 + }, + { + "entropy": 0.16419745851657352, + "epoch": 0.36596926740007496, + "grad_norm": 3.296875, + "learning_rate": 0.00019559861028431547, + "loss": 0.2253945541381836, + "mean_token_accuracy": 0.9522608941793442, + "num_tokens": 4375715.0, + "step": 4150 + }, + { + "entropy": 0.15910948771517724, + "epoch": 0.36817390154103924, + "grad_norm": 6.40625, + "learning_rate": 0.00019552850558436242, + "loss": 0.2144721794128418, + "mean_token_accuracy": 0.9533983466029167, + "num_tokens": 4401090.0, + "step": 4175 + }, + { + "entropy": 0.16045264942571522, + "epoch": 0.3703785356820036, + "grad_norm": 5.0625, + "learning_rate": 0.00019545785974273247, + "loss": 0.1865951156616211, + "mean_token_accuracy": 0.9595495608448982, + "num_tokens": 4426453.0, + "step": 4200 + }, + { + "entropy": 0.16479153966531157, + "epoch": 0.37258316982296785, + "grad_norm": 5.09375, + "learning_rate": 0.00019538667315961415, + "loss": 0.1973789405822754, + "mean_token_accuracy": 0.9551339733600617, + "num_tokens": 4451172.0, + "step": 4225 + }, + { + "entropy": 0.19048072915524245, + "epoch": 0.3747878039639322, + "grad_norm": 8.0625, + "learning_rate": 0.00019531494623825917, + "loss": 0.2748769378662109, + "mean_token_accuracy": 0.9468750369548797, + "num_tokens": 4479694.0, + "step": 4250 + }, + { + "entropy": 0.1996890745626297, + "epoch": 0.37699243810489647, + "grad_norm": 12.0, + "learning_rate": 0.0001952426793849799, + "loss": 0.23543556213378905, + "mean_token_accuracy": 0.9533787325024605, + "num_tokens": 4506605.0, + "step": 4275 + }, + { + "entropy": 0.17569803014863283, + "epoch": 0.3791970722458608, + "grad_norm": 8.6875, + "learning_rate": 0.00019516987300914753, + "loss": 0.22635995864868164, + "mean_token_accuracy": 0.9482355606555939, + "num_tokens": 4533336.0, + "step": 4300 + }, + { + "entropy": 0.18504171287175267, + "epoch": 0.3814017063868251, + "grad_norm": 5.09375, + "learning_rate": 0.0001950965275231893, + "loss": 0.23127546310424804, + "mean_token_accuracy": 0.9499982151389122, + "num_tokens": 4560672.0, + "step": 4325 + }, + { + "entropy": 0.1998164102109149, + "epoch": 0.3836063405277894, + "grad_norm": 6.71875, + "learning_rate": 0.00019502264334258644, + "loss": 0.2719668769836426, + "mean_token_accuracy": 0.9475091502070427, + "num_tokens": 4587955.0, + "step": 4350 + }, + { + "entropy": 0.16743096579564734, + "epoch": 0.38581097466875375, + "grad_norm": 7.6875, + "learning_rate": 0.00019494822088587168, + "loss": 0.22320510864257812, + "mean_token_accuracy": 0.9563138785958291, + "num_tokens": 4614744.0, + "step": 4375 + }, + { + "entropy": 0.15541932496009395, + "epoch": 0.388015608809718, + "grad_norm": 5.34375, + "learning_rate": 0.00019487326057462704, + "loss": 0.18758811950683593, + "mean_token_accuracy": 0.9599009090662003, + "num_tokens": 4640942.0, + "step": 4400 + }, + { + "entropy": 0.153234211106319, + "epoch": 0.39022024295068236, + "grad_norm": 6.8125, + "learning_rate": 0.0001947977628334812, + "loss": 0.1849317741394043, + "mean_token_accuracy": 0.9574062630534173, + "num_tokens": 4666916.0, + "step": 4425 + }, + { + "entropy": 0.19829942288808525, + "epoch": 0.39242487709164664, + "grad_norm": 6.03125, + "learning_rate": 0.0001947217280901073, + "loss": 0.2654203224182129, + "mean_token_accuracy": 0.9488379114866257, + "num_tokens": 4693082.0, + "step": 4450 + }, + { + "entropy": 0.16388011447619646, + "epoch": 0.394629511232611, + "grad_norm": 6.65625, + "learning_rate": 0.00019464515677522037, + "loss": 0.20659452438354492, + "mean_token_accuracy": 0.9587969416379929, + "num_tokens": 4718146.0, + "step": 4475 + }, + { + "entropy": 0.15318717606249266, + "epoch": 0.39683414537357525, + "grad_norm": 1.7265625, + "learning_rate": 0.00019456804932257513, + "loss": 0.21035289764404297, + "mean_token_accuracy": 0.9577053633332252, + "num_tokens": 4745031.0, + "step": 4500 + }, + { + "entropy": 0.16182554476428776, + "epoch": 0.3990387795145396, + "grad_norm": 7.53125, + "learning_rate": 0.00019449040616896314, + "loss": 0.22390464782714845, + "mean_token_accuracy": 0.9582294577360153, + "num_tokens": 4770398.0, + "step": 4525 + }, + { + "entropy": 0.16457911616773344, + "epoch": 0.40124341365550387, + "grad_norm": 5.71875, + "learning_rate": 0.00019441222775421076, + "loss": 0.2118742561340332, + "mean_token_accuracy": 0.9571871975064278, + "num_tokens": 4795649.0, + "step": 4550 + }, + { + "entropy": 0.17897488735150546, + "epoch": 0.4034480477964682, + "grad_norm": 6.5625, + "learning_rate": 0.00019433351452117635, + "loss": 0.22478347778320312, + "mean_token_accuracy": 0.9507955679297447, + "num_tokens": 4821974.0, + "step": 4575 + }, + { + "entropy": 0.20635093068238347, + "epoch": 0.4056526819374325, + "grad_norm": 5.375, + "learning_rate": 0.00019425426691574785, + "loss": 0.30243276596069335, + "mean_token_accuracy": 0.940371046513319, + "num_tokens": 4850609.0, + "step": 4600 + }, + { + "entropy": 0.16062138011795468, + "epoch": 0.4078573160783968, + "grad_norm": 6.1875, + "learning_rate": 0.00019417448538684026, + "loss": 0.20213171005249023, + "mean_token_accuracy": 0.9580634304881096, + "num_tokens": 4876172.0, + "step": 4625 + }, + { + "entropy": 0.15726084834081122, + "epoch": 0.4100619502193611, + "grad_norm": 4.46875, + "learning_rate": 0.00019409417038639322, + "loss": 0.2142583656311035, + "mean_token_accuracy": 0.9570022109150886, + "num_tokens": 4902385.0, + "step": 4650 + }, + { + "entropy": 0.17165626873378642, + "epoch": 0.4122665843603254, + "grad_norm": 7.0, + "learning_rate": 0.00019401332236936817, + "loss": 0.23059816360473634, + "mean_token_accuracy": 0.9527977633476258, + "num_tokens": 4929222.0, + "step": 4675 + }, + { + "entropy": 0.1640852385875769, + "epoch": 0.4144712185012897, + "grad_norm": 5.46875, + "learning_rate": 0.00019393194179374604, + "loss": 0.2273613166809082, + "mean_token_accuracy": 0.9548634958267211, + "num_tokens": 4955671.0, + "step": 4700 + }, + { + "entropy": 0.1570255648170132, + "epoch": 0.41667585264225404, + "grad_norm": 2.734375, + "learning_rate": 0.00019385002912052454, + "loss": 0.2143065071105957, + "mean_token_accuracy": 0.9560745039582252, + "num_tokens": 4982931.0, + "step": 4725 + }, + { + "entropy": 0.17041039526229723, + "epoch": 0.4188804867832183, + "grad_norm": 4.0, + "learning_rate": 0.00019376758481371556, + "loss": 0.22071348190307616, + "mean_token_accuracy": 0.9523491749167442, + "num_tokens": 5010878.0, + "step": 4750 + }, + { + "entropy": 0.16272783821914344, + "epoch": 0.42108512092418265, + "grad_norm": 5.84375, + "learning_rate": 0.0001936846093403425, + "loss": 0.2147397804260254, + "mean_token_accuracy": 0.9509309217333793, + "num_tokens": 5036498.0, + "step": 4775 + }, + { + "entropy": 0.19207242332515306, + "epoch": 0.42328975506514693, + "grad_norm": 4.125, + "learning_rate": 0.00019360110317043772, + "loss": 0.2490982246398926, + "mean_token_accuracy": 0.9515020102262497, + "num_tokens": 5063787.0, + "step": 4800 + }, + { + "entropy": 0.17644908792804925, + "epoch": 0.42549438920611127, + "grad_norm": 9.875, + "learning_rate": 0.00019351706677703975, + "loss": 0.23829484939575196, + "mean_token_accuracy": 0.9441190361976624, + "num_tokens": 5091038.0, + "step": 4825 + }, + { + "entropy": 0.14317711226758548, + "epoch": 0.42769902334707555, + "grad_norm": 6.875, + "learning_rate": 0.00019343250063619082, + "loss": 0.18992048263549804, + "mean_token_accuracy": 0.9606717613339424, + "num_tokens": 5118165.0, + "step": 4850 + }, + { + "entropy": 0.18368843147065492, + "epoch": 0.4299036574880399, + "grad_norm": 19.0, + "learning_rate": 0.00019334740522693392, + "loss": 0.2248593521118164, + "mean_token_accuracy": 0.9531078413128853, + "num_tokens": 5144207.0, + "step": 4875 + }, + { + "entropy": 0.1561399988271296, + "epoch": 0.43210829162900416, + "grad_norm": 5.78125, + "learning_rate": 0.00019326178103131017, + "loss": 0.2132600212097168, + "mean_token_accuracy": 0.9495953798294068, + "num_tokens": 5169691.0, + "step": 4900 + }, + { + "entropy": 0.14101963342865928, + "epoch": 0.4343129257699685, + "grad_norm": 2.734375, + "learning_rate": 0.0001931756285343562, + "loss": 0.18087581634521485, + "mean_token_accuracy": 0.9611552309989929, + "num_tokens": 5195767.0, + "step": 4925 + }, + { + "entropy": 0.16966247914591803, + "epoch": 0.4365175599109328, + "grad_norm": 3.4375, + "learning_rate": 0.0001930889482241013, + "loss": 0.2111412239074707, + "mean_token_accuracy": 0.9541240110993385, + "num_tokens": 5222731.0, + "step": 4950 + }, + { + "entropy": 0.1666613586130552, + "epoch": 0.4387221940518971, + "grad_norm": 7.125, + "learning_rate": 0.0001930017405915646, + "loss": 0.2021429443359375, + "mean_token_accuracy": 0.9588029065728187, + "num_tokens": 5249113.0, + "step": 4975 + }, + { + "entropy": 0.14360476849251427, + "epoch": 0.4409268281928614, + "grad_norm": 6.71875, + "learning_rate": 0.00019291400613075243, + "loss": 0.22467676162719727, + "mean_token_accuracy": 0.955079542696476, + "num_tokens": 5276016.0, + "step": 5000 + }, + { + "epoch": 0.4409268281928614, + "eval_entropy": 0.10869147787145353, + "eval_loss": 0.10057255625724792, + "eval_mean_token_accuracy": 0.9738030348158805, + "eval_num_tokens": 5276016.0, + "eval_runtime": 227.4944, + "eval_samples_per_second": 17.271, + "eval_steps_per_second": 4.321, + "step": 5000 + }, + { + "entropy": 0.13281712058582343, + "epoch": 0.4431314623338257, + "grad_norm": 16.75, + "learning_rate": 0.00019282574533865542, + "loss": 0.1655169105529785, + "mean_token_accuracy": 0.9662536835670471, + "num_tokens": 5299609.0, + "step": 5025 + }, + { + "entropy": 0.15537576926173643, + "epoch": 0.44533609647479, + "grad_norm": 3.53125, + "learning_rate": 0.00019273695871524575, + "loss": 0.18183206558227538, + "mean_token_accuracy": 0.961754854619503, + "num_tokens": 5324890.0, + "step": 5050 + }, + { + "entropy": 0.17722081926651298, + "epoch": 0.44754073061575433, + "grad_norm": 5.53125, + "learning_rate": 0.00019264764676347427, + "loss": 0.2252979850769043, + "mean_token_accuracy": 0.9469957205653191, + "num_tokens": 5352166.0, + "step": 5075 + }, + { + "entropy": 0.16153283580671995, + "epoch": 0.4497453647567186, + "grad_norm": 5.96875, + "learning_rate": 0.00019255780998926763, + "loss": 0.20538576126098632, + "mean_token_accuracy": 0.9543823391199112, + "num_tokens": 5379076.0, + "step": 5100 + }, + { + "entropy": 0.17977383355842902, + "epoch": 0.45194999889768295, + "grad_norm": 5.53125, + "learning_rate": 0.00019246744890152545, + "loss": 0.2403662109375, + "mean_token_accuracy": 0.9503585311770439, + "num_tokens": 5404897.0, + "step": 5125 + }, + { + "entropy": 0.14496052238857374, + "epoch": 0.4541546330386472, + "grad_norm": 5.375, + "learning_rate": 0.00019237656401211757, + "loss": 0.19734691619873046, + "mean_token_accuracy": 0.9596367552876472, + "num_tokens": 5431544.0, + "step": 5150 + }, + { + "entropy": 0.16713952826918102, + "epoch": 0.45635926717961156, + "grad_norm": 6.75, + "learning_rate": 0.00019228515583588079, + "loss": 0.22258760452270507, + "mean_token_accuracy": 0.9565337428450584, + "num_tokens": 5457123.0, + "step": 5175 + }, + { + "entropy": 0.16149105805088765, + "epoch": 0.45856390132057584, + "grad_norm": 8.5625, + "learning_rate": 0.00019219322489061634, + "loss": 0.22547231674194335, + "mean_token_accuracy": 0.9535848796367645, + "num_tokens": 5484041.0, + "step": 5200 + }, + { + "entropy": 0.148517404072918, + "epoch": 0.4607685354615402, + "grad_norm": 3.0625, + "learning_rate": 0.00019210077169708675, + "loss": 0.2167955207824707, + "mean_token_accuracy": 0.9557743620872498, + "num_tokens": 5511905.0, + "step": 5225 + }, + { + "entropy": 0.1500109832943417, + "epoch": 0.46297316960250445, + "grad_norm": 4.0625, + "learning_rate": 0.00019200779677901295, + "loss": 0.21606193542480467, + "mean_token_accuracy": 0.9585763025283813, + "num_tokens": 5537774.0, + "step": 5250 + }, + { + "entropy": 0.1966726200678386, + "epoch": 0.4651778037434688, + "grad_norm": 5.8125, + "learning_rate": 0.00019191430066307124, + "loss": 0.23859842300415038, + "mean_token_accuracy": 0.9467063054442406, + "num_tokens": 5565426.0, + "step": 5275 + }, + { + "entropy": 0.17433940736344083, + "epoch": 0.46738243788443307, + "grad_norm": 4.46875, + "learning_rate": 0.0001918202838788904, + "loss": 0.224346923828125, + "mean_token_accuracy": 0.9562974636256695, + "num_tokens": 5591888.0, + "step": 5300 + }, + { + "entropy": 0.13106359140831045, + "epoch": 0.4695870720253974, + "grad_norm": 5.5625, + "learning_rate": 0.0001917257469590487, + "loss": 0.1792782211303711, + "mean_token_accuracy": 0.9603891870379448, + "num_tokens": 5617317.0, + "step": 5325 + }, + { + "entropy": 0.15177158450707792, + "epoch": 0.4717917061663617, + "grad_norm": 3.921875, + "learning_rate": 0.00019163069043907064, + "loss": 0.21172012329101564, + "mean_token_accuracy": 0.9570096290111542, + "num_tokens": 5643652.0, + "step": 5350 + }, + { + "entropy": 0.1602694686234463, + "epoch": 0.473996340307326, + "grad_norm": 7.65625, + "learning_rate": 0.00019153511485742435, + "loss": 0.19996971130371094, + "mean_token_accuracy": 0.9593239459395408, + "num_tokens": 5669256.0, + "step": 5375 + }, + { + "entropy": 0.15173581497278066, + "epoch": 0.4762009744482903, + "grad_norm": 3.71875, + "learning_rate": 0.0001914390207555181, + "loss": 0.19701797485351563, + "mean_token_accuracy": 0.9582894539833069, + "num_tokens": 5694392.0, + "step": 5400 + }, + { + "entropy": 0.1408918967458885, + "epoch": 0.4784056085892546, + "grad_norm": 4.28125, + "learning_rate": 0.00019134240867769756, + "loss": 0.17704242706298828, + "mean_token_accuracy": 0.9625860941410065, + "num_tokens": 5720793.0, + "step": 5425 + }, + { + "entropy": 0.15985940964194015, + "epoch": 0.4806102427302189, + "grad_norm": 5.59375, + "learning_rate": 0.0001912452791712425, + "loss": 0.2061847686767578, + "mean_token_accuracy": 0.9506743770837783, + "num_tokens": 5748648.0, + "step": 5450 + }, + { + "entropy": 0.1572291606734507, + "epoch": 0.48281487687118324, + "grad_norm": 12.5, + "learning_rate": 0.00019114763278636385, + "loss": 0.1869456672668457, + "mean_token_accuracy": 0.9610082852840424, + "num_tokens": 5774702.0, + "step": 5475 + }, + { + "entropy": 0.15579537914483807, + "epoch": 0.4850195110121475, + "grad_norm": 3.328125, + "learning_rate": 0.00019104947007620045, + "loss": 0.22569913864135743, + "mean_token_accuracy": 0.9553910180926323, + "num_tokens": 5800390.0, + "step": 5500 + }, + { + "entropy": 0.1744847889256198, + "epoch": 0.48722414515311185, + "grad_norm": 5.40625, + "learning_rate": 0.00019095079159681596, + "loss": 0.2129666328430176, + "mean_token_accuracy": 0.9534970700740815, + "num_tokens": 5828020.0, + "step": 5525 + }, + { + "entropy": 0.12404027001583018, + "epoch": 0.48942877929407613, + "grad_norm": 3.71875, + "learning_rate": 0.0001908515979071958, + "loss": 0.13661216735839843, + "mean_token_accuracy": 0.9668267214298248, + "num_tokens": 5851718.0, + "step": 5550 + }, + { + "entropy": 0.15956679730443285, + "epoch": 0.49163341343504047, + "grad_norm": 8.75, + "learning_rate": 0.00019075188956924386, + "loss": 0.21861886978149414, + "mean_token_accuracy": 0.9555191951990127, + "num_tokens": 5879049.0, + "step": 5575 + }, + { + "entropy": 0.15942344037815928, + "epoch": 0.49383804757600475, + "grad_norm": 5.0625, + "learning_rate": 0.00019065166714777934, + "loss": 0.20734643936157227, + "mean_token_accuracy": 0.9620197328925133, + "num_tokens": 5904931.0, + "step": 5600 + }, + { + "entropy": 0.19217802144237794, + "epoch": 0.4960426817169691, + "grad_norm": 4.53125, + "learning_rate": 0.00019055093121053365, + "loss": 0.26606002807617185, + "mean_token_accuracy": 0.9508296462893486, + "num_tokens": 5931600.0, + "step": 5625 + }, + { + "entropy": 0.15677706636604852, + "epoch": 0.49824731585793336, + "grad_norm": 4.28125, + "learning_rate": 0.00019044968232814703, + "loss": 0.2043045425415039, + "mean_token_accuracy": 0.9544167664647102, + "num_tokens": 5957753.0, + "step": 5650 + }, + { + "entropy": 0.18609977641841396, + "epoch": 0.5004519499988976, + "grad_norm": 3.578125, + "learning_rate": 0.00019034792107416553, + "loss": 0.21894699096679687, + "mean_token_accuracy": 0.9534892725944519, + "num_tokens": 5985243.0, + "step": 5675 + }, + { + "entropy": 0.14997990630334243, + "epoch": 0.502656584139862, + "grad_norm": 7.3125, + "learning_rate": 0.0001902456480250375, + "loss": 0.19551092147827148, + "mean_token_accuracy": 0.9585751879215241, + "num_tokens": 6010402.0, + "step": 5700 + }, + { + "entropy": 0.1572682385914959, + "epoch": 0.5048612182808263, + "grad_norm": 4.34375, + "learning_rate": 0.00019014286376011055, + "loss": 0.19206081390380858, + "mean_token_accuracy": 0.9578534030914306, + "num_tokens": 6037075.0, + "step": 5725 + }, + { + "entropy": 0.15716539665358142, + "epoch": 0.5070658524217906, + "grad_norm": 5.03125, + "learning_rate": 0.00019003956886162816, + "loss": 0.2075516700744629, + "mean_token_accuracy": 0.9590855090320111, + "num_tokens": 6063439.0, + "step": 5750 + }, + { + "entropy": 0.14328026061179117, + "epoch": 0.5092704865627549, + "grad_norm": 9.3125, + "learning_rate": 0.0001899357639147264, + "loss": 0.1804123878479004, + "mean_token_accuracy": 0.9630845382809639, + "num_tokens": 6089276.0, + "step": 5775 + }, + { + "entropy": 0.1787011620606063, + "epoch": 0.5114751207037193, + "grad_norm": 3.296875, + "learning_rate": 0.0001898314495074306, + "loss": 0.21809492111206055, + "mean_token_accuracy": 0.955017312169075, + "num_tokens": 6116074.0, + "step": 5800 + }, + { + "entropy": 0.14735706645878963, + "epoch": 0.5136797548446835, + "grad_norm": 11.625, + "learning_rate": 0.0001897266262306521, + "loss": 0.18583986282348633, + "mean_token_accuracy": 0.9612311086058617, + "num_tokens": 6142300.0, + "step": 5825 + }, + { + "entropy": 0.13354225278948434, + "epoch": 0.5158843889856478, + "grad_norm": 4.71875, + "learning_rate": 0.0001896212946781848, + "loss": 0.16306705474853517, + "mean_token_accuracy": 0.9689052039384842, + "num_tokens": 6167238.0, + "step": 5850 + }, + { + "entropy": 0.1298945102340076, + "epoch": 0.5180890231266121, + "grad_norm": 3.296875, + "learning_rate": 0.0001895154554467018, + "loss": 0.15991472244262694, + "mean_token_accuracy": 0.9638067600131035, + "num_tokens": 6193840.0, + "step": 5875 + }, + { + "entropy": 0.14238889124128037, + "epoch": 0.5202936572675765, + "grad_norm": 4.375, + "learning_rate": 0.00018940910913575206, + "loss": 0.19000024795532228, + "mean_token_accuracy": 0.9612208670377731, + "num_tokens": 6219503.0, + "step": 5900 + }, + { + "entropy": 0.20510360905434935, + "epoch": 0.5224982914085408, + "grad_norm": 3.625, + "learning_rate": 0.00018930225634775715, + "loss": 0.26463899612426756, + "mean_token_accuracy": 0.9487945803999901, + "num_tokens": 6247058.0, + "step": 5925 + }, + { + "entropy": 0.1622152561810799, + "epoch": 0.524702925549505, + "grad_norm": 4.5625, + "learning_rate": 0.00018919489768800746, + "loss": 0.25773733139038085, + "mean_token_accuracy": 0.9568088221549987, + "num_tokens": 6273064.0, + "step": 5950 + }, + { + "entropy": 0.187178902864689, + "epoch": 0.5269075596904693, + "grad_norm": 4.15625, + "learning_rate": 0.00018908703376465917, + "loss": 0.24045007705688476, + "mean_token_accuracy": 0.9494410088658333, + "num_tokens": 6299137.0, + "step": 5975 + }, + { + "entropy": 0.1452439275966026, + "epoch": 0.5291121938314337, + "grad_norm": 9.625, + "learning_rate": 0.00018897866518873053, + "loss": 0.21541589736938477, + "mean_token_accuracy": 0.9575134646892548, + "num_tokens": 6324785.0, + "step": 6000 + }, + { + "epoch": 0.5291121938314337, + "eval_entropy": 0.10676932354227452, + "eval_loss": 0.10486993938684464, + "eval_mean_token_accuracy": 0.971874090444035, + "eval_num_tokens": 6324785.0, + "eval_runtime": 226.9895, + "eval_samples_per_second": 17.309, + "eval_steps_per_second": 4.331, + "step": 6000 + }, + { + "entropy": 0.17648567588767036, + "epoch": 0.531316827972398, + "grad_norm": 3.734375, + "learning_rate": 0.0001888697925740986, + "loss": 0.20909486770629881, + "mean_token_accuracy": 0.954936962723732, + "num_tokens": 6351238.0, + "step": 6025 + }, + { + "entropy": 0.16155564374057577, + "epoch": 0.5335214621133623, + "grad_norm": 1.28125, + "learning_rate": 0.00018876041653749552, + "loss": 0.1900315284729004, + "mean_token_accuracy": 0.9586120668053627, + "num_tokens": 6378264.0, + "step": 6050 + }, + { + "entropy": 0.15692011128994637, + "epoch": 0.5357260962543265, + "grad_norm": 6.65625, + "learning_rate": 0.00018865053769850538, + "loss": 0.2241463279724121, + "mean_token_accuracy": 0.9549520462751389, + "num_tokens": 6405407.0, + "step": 6075 + }, + { + "entropy": 0.16224516270449385, + "epoch": 0.5379307303952909, + "grad_norm": 5.03125, + "learning_rate": 0.00018854015667956034, + "loss": 0.20405168533325196, + "mean_token_accuracy": 0.9515541243553162, + "num_tokens": 6432386.0, + "step": 6100 + }, + { + "entropy": 0.1313534512137994, + "epoch": 0.5401353645362552, + "grad_norm": 3.390625, + "learning_rate": 0.00018842927410593732, + "loss": 0.16415348052978515, + "mean_token_accuracy": 0.9615388405323029, + "num_tokens": 6458905.0, + "step": 6125 + }, + { + "entropy": 0.1441493243572768, + "epoch": 0.5423399986772195, + "grad_norm": 4.75, + "learning_rate": 0.00018831789060575442, + "loss": 0.20023174285888673, + "mean_token_accuracy": 0.9581418094038964, + "num_tokens": 6485447.0, + "step": 6150 + }, + { + "entropy": 0.16584216450923123, + "epoch": 0.5445446328181838, + "grad_norm": 9.0, + "learning_rate": 0.0001882060068099673, + "loss": 0.22106365203857423, + "mean_token_accuracy": 0.9604924789071083, + "num_tokens": 6511693.0, + "step": 6175 + }, + { + "entropy": 0.1466969000035897, + "epoch": 0.5467492669591482, + "grad_norm": 3.546875, + "learning_rate": 0.00018809362335236575, + "loss": 0.1853495407104492, + "mean_token_accuracy": 0.9564072874188423, + "num_tokens": 6538084.0, + "step": 6200 + }, + { + "entropy": 0.14249630046426318, + "epoch": 0.5489539011001124, + "grad_norm": 2.890625, + "learning_rate": 0.00018798074086956988, + "loss": 0.16907304763793946, + "mean_token_accuracy": 0.9658751469850541, + "num_tokens": 6563262.0, + "step": 6225 + }, + { + "entropy": 0.13506768403225577, + "epoch": 0.5511585352410767, + "grad_norm": 5.15625, + "learning_rate": 0.00018786736000102664, + "loss": 0.1854391860961914, + "mean_token_accuracy": 0.9624115920066834, + "num_tokens": 6590039.0, + "step": 6250 + }, + { + "entropy": 0.15223577088676393, + "epoch": 0.553363169382041, + "grad_norm": 7.3125, + "learning_rate": 0.00018775348138900632, + "loss": 0.20777523040771484, + "mean_token_accuracy": 0.9607916563749314, + "num_tokens": 6615751.0, + "step": 6275 + }, + { + "entropy": 0.1341767003881978, + "epoch": 0.5555678035230054, + "grad_norm": 6.03125, + "learning_rate": 0.00018763910567859868, + "loss": 0.18818994522094726, + "mean_token_accuracy": 0.9605262127518653, + "num_tokens": 6641586.0, + "step": 6300 + }, + { + "entropy": 0.1822560296789743, + "epoch": 0.5577724376639697, + "grad_norm": 4.53125, + "learning_rate": 0.00018752423351770943, + "loss": 0.21607881546020508, + "mean_token_accuracy": 0.9530105289816856, + "num_tokens": 6668241.0, + "step": 6325 + }, + { + "entropy": 0.15559989049565048, + "epoch": 0.559977071804934, + "grad_norm": 1.0078125, + "learning_rate": 0.00018740886555705647, + "loss": 0.18804313659667968, + "mean_token_accuracy": 0.9619828999042511, + "num_tokens": 6694831.0, + "step": 6350 + }, + { + "entropy": 0.15529708388843574, + "epoch": 0.5621817059458982, + "grad_norm": 5.46875, + "learning_rate": 0.00018729300245016642, + "loss": 0.21643871307373047, + "mean_token_accuracy": 0.9549458172917366, + "num_tokens": 6721275.0, + "step": 6375 + }, + { + "entropy": 0.14793858348275535, + "epoch": 0.5643863400868626, + "grad_norm": 2.890625, + "learning_rate": 0.00018717664485337057, + "loss": 0.19648042678833008, + "mean_token_accuracy": 0.9602294343709946, + "num_tokens": 6747046.0, + "step": 6400 + }, + { + "entropy": 0.17088732279487887, + "epoch": 0.5665909742278269, + "grad_norm": 4.75, + "learning_rate": 0.00018705979342580146, + "loss": 0.2485208511352539, + "mean_token_accuracy": 0.9500703465938568, + "num_tokens": 6774554.0, + "step": 6425 + }, + { + "entropy": 0.15431675165193157, + "epoch": 0.5687956083687912, + "grad_norm": 2.859375, + "learning_rate": 0.00018694244882938907, + "loss": 0.1909835433959961, + "mean_token_accuracy": 0.9602775621414185, + "num_tokens": 6800505.0, + "step": 6450 + }, + { + "entropy": 0.15024025222170168, + "epoch": 0.5710002425097555, + "grad_norm": 4.375, + "learning_rate": 0.00018682461172885698, + "loss": 0.20096403121948242, + "mean_token_accuracy": 0.9598821967840194, + "num_tokens": 6827587.0, + "step": 6475 + }, + { + "entropy": 0.15101411423878744, + "epoch": 0.5732048766507198, + "grad_norm": 4.75, + "learning_rate": 0.00018670628279171862, + "loss": 0.2041637420654297, + "mean_token_accuracy": 0.9568271943926812, + "num_tokens": 6854524.0, + "step": 6500 + }, + { + "entropy": 0.17572721259552054, + "epoch": 0.5754095107916841, + "grad_norm": 6.8125, + "learning_rate": 0.0001865874626882737, + "loss": 0.2292483901977539, + "mean_token_accuracy": 0.956166204214096, + "num_tokens": 6880994.0, + "step": 6525 + }, + { + "entropy": 0.11857264762860723, + "epoch": 0.5776141449326484, + "grad_norm": 3.046875, + "learning_rate": 0.00018646815209160406, + "loss": 0.1415870189666748, + "mean_token_accuracy": 0.9724202772974968, + "num_tokens": 6906538.0, + "step": 6550 + }, + { + "entropy": 0.18178646504296922, + "epoch": 0.5798187790736128, + "grad_norm": 2.828125, + "learning_rate": 0.00018634835167757015, + "loss": 0.25645376205444337, + "mean_token_accuracy": 0.9543059349060059, + "num_tokens": 6934582.0, + "step": 6575 + }, + { + "entropy": 0.15401741547277198, + "epoch": 0.5820234132145771, + "grad_norm": 9.25, + "learning_rate": 0.00018622806212480707, + "loss": 0.21368270874023437, + "mean_token_accuracy": 0.9566551733016968, + "num_tokens": 6960422.0, + "step": 6600 + }, + { + "entropy": 0.14491610620287246, + "epoch": 0.5842280473555413, + "grad_norm": 5.71875, + "learning_rate": 0.0001861072841147207, + "loss": 0.18427518844604493, + "mean_token_accuracy": 0.9608854773640633, + "num_tokens": 6987570.0, + "step": 6625 + }, + { + "entropy": 0.16291028478881345, + "epoch": 0.5864326814965056, + "grad_norm": 4.09375, + "learning_rate": 0.00018598601833148405, + "loss": 0.20536699295043945, + "mean_token_accuracy": 0.9561599251627922, + "num_tokens": 7015159.0, + "step": 6650 + }, + { + "entropy": 0.15031578235328197, + "epoch": 0.58863731563747, + "grad_norm": 5.96875, + "learning_rate": 0.00018586426546203302, + "loss": 0.2286543083190918, + "mean_token_accuracy": 0.9568568438291549, + "num_tokens": 7041683.0, + "step": 6675 + }, + { + "entropy": 0.15518903702031822, + "epoch": 0.5908419497784343, + "grad_norm": 7.90625, + "learning_rate": 0.00018574202619606287, + "loss": 0.20627035140991212, + "mean_token_accuracy": 0.9582181671261787, + "num_tokens": 7067898.0, + "step": 6700 + }, + { + "entropy": 0.1478130117879482, + "epoch": 0.5930465839193986, + "grad_norm": 8.1875, + "learning_rate": 0.0001856193012260241, + "loss": 0.17444656372070313, + "mean_token_accuracy": 0.9628511995077134, + "num_tokens": 7093130.0, + "step": 6725 + }, + { + "entropy": 0.16152513926150278, + "epoch": 0.5952512180603629, + "grad_norm": 5.0, + "learning_rate": 0.00018549609124711853, + "loss": 0.2013174057006836, + "mean_token_accuracy": 0.9573907378315926, + "num_tokens": 7119773.0, + "step": 6750 + }, + { + "entropy": 0.16015219626016916, + "epoch": 0.5974558522013272, + "grad_norm": 4.375, + "learning_rate": 0.0001853723969572955, + "loss": 0.20601200103759765, + "mean_token_accuracy": 0.9583083838224411, + "num_tokens": 7147090.0, + "step": 6775 + }, + { + "entropy": 0.12353788227774203, + "epoch": 0.5996604863422915, + "grad_norm": 3.734375, + "learning_rate": 0.00018524821905724782, + "loss": 0.1696053123474121, + "mean_token_accuracy": 0.9629499089717865, + "num_tokens": 7173628.0, + "step": 6800 + }, + { + "entropy": 0.13745511685323436, + "epoch": 0.6018651204832558, + "grad_norm": 7.625, + "learning_rate": 0.0001851235582504078, + "loss": 0.19153583526611329, + "mean_token_accuracy": 0.962202197611332, + "num_tokens": 7199491.0, + "step": 6825 + }, + { + "entropy": 0.15157184965210035, + "epoch": 0.6040697546242201, + "grad_norm": 4.125, + "learning_rate": 0.00018499841524294324, + "loss": 0.21617828369140624, + "mean_token_accuracy": 0.956987452507019, + "num_tokens": 7225688.0, + "step": 6850 + }, + { + "entropy": 0.16270094153587705, + "epoch": 0.6062743887651845, + "grad_norm": 5.0, + "learning_rate": 0.00018487279074375353, + "loss": 0.19658201217651367, + "mean_token_accuracy": 0.9590710332989693, + "num_tokens": 7251979.0, + "step": 6875 + }, + { + "entropy": 0.13592043185140937, + "epoch": 0.6084790229061487, + "grad_norm": 4.1875, + "learning_rate": 0.00018474668546446555, + "loss": 0.17008283615112305, + "mean_token_accuracy": 0.9622329398989677, + "num_tokens": 7278416.0, + "step": 6900 + }, + { + "entropy": 0.15448786061489955, + "epoch": 0.610683657047113, + "grad_norm": 5.59375, + "learning_rate": 0.0001846201001194296, + "loss": 0.2108542251586914, + "mean_token_accuracy": 0.9560682138800621, + "num_tokens": 7305245.0, + "step": 6925 + }, + { + "entropy": 0.17204750362201593, + "epoch": 0.6128882911880773, + "grad_norm": 3.1875, + "learning_rate": 0.0001844930354257156, + "loss": 0.22721254348754882, + "mean_token_accuracy": 0.9545628592371941, + "num_tokens": 7331880.0, + "step": 6950 + }, + { + "entropy": 0.17656717666075564, + "epoch": 0.6150929253290417, + "grad_norm": 4.4375, + "learning_rate": 0.00018436549210310862, + "loss": 0.23982276916503906, + "mean_token_accuracy": 0.9470600582659244, + "num_tokens": 7358764.0, + "step": 6975 + }, + { + "entropy": 0.14622056474676356, + "epoch": 0.617297559470006, + "grad_norm": 2.3125, + "learning_rate": 0.00018423747087410513, + "loss": 0.17314342498779298, + "mean_token_accuracy": 0.9619392481446266, + "num_tokens": 7385487.0, + "step": 7000 + }, + { + "epoch": 0.617297559470006, + "eval_entropy": 0.07945120481571462, + "eval_loss": 0.08194578438997269, + "eval_mean_token_accuracy": 0.9766436420525079, + "eval_num_tokens": 7385487.0, + "eval_runtime": 240.9795, + "eval_samples_per_second": 16.304, + "eval_steps_per_second": 4.079, + "step": 7000 + }, + { + "entropy": 0.13026506319874898, + "epoch": 0.6195021936109703, + "grad_norm": 7.40625, + "learning_rate": 0.0001841089724639088, + "loss": 0.1890636444091797, + "mean_token_accuracy": 0.9626975417137146, + "num_tokens": 7410738.0, + "step": 7025 + }, + { + "entropy": 0.1509742072003428, + "epoch": 0.6217068277519345, + "grad_norm": 5.09375, + "learning_rate": 0.00018397999760042644, + "loss": 0.21692123413085937, + "mean_token_accuracy": 0.9581372204422951, + "num_tokens": 7437696.0, + "step": 7050 + }, + { + "entropy": 0.1558595033886377, + "epoch": 0.6239114618928989, + "grad_norm": 5.15625, + "learning_rate": 0.00018385054701426372, + "loss": 0.22689201354980468, + "mean_token_accuracy": 0.9588596966862678, + "num_tokens": 7463958.0, + "step": 7075 + }, + { + "entropy": 0.13060404141549953, + "epoch": 0.6261160960338632, + "grad_norm": 4.875, + "learning_rate": 0.00018372062143872127, + "loss": 0.1784954261779785, + "mean_token_accuracy": 0.9633405435085297, + "num_tokens": 7489503.0, + "step": 7100 + }, + { + "entropy": 0.14149722107453272, + "epoch": 0.6283207301748275, + "grad_norm": 7.34375, + "learning_rate": 0.00018359022160979027, + "loss": 0.1862514877319336, + "mean_token_accuracy": 0.9588928538560867, + "num_tokens": 7515587.0, + "step": 7125 + }, + { + "entropy": 0.14298222974175587, + "epoch": 0.6305253643157918, + "grad_norm": 4.21875, + "learning_rate": 0.0001834593482661485, + "loss": 0.1809256362915039, + "mean_token_accuracy": 0.9628977358341217, + "num_tokens": 7542878.0, + "step": 7150 + }, + { + "entropy": 0.13202147453674115, + "epoch": 0.6327299984567561, + "grad_norm": 4.71875, + "learning_rate": 0.0001833280021491561, + "loss": 0.18633668899536132, + "mean_token_accuracy": 0.9624699577689171, + "num_tokens": 7569706.0, + "step": 7175 + }, + { + "entropy": 0.15507492707343773, + "epoch": 0.6349346325977204, + "grad_norm": 6.625, + "learning_rate": 0.00018319618400285115, + "loss": 0.20065113067626952, + "mean_token_accuracy": 0.9619992870092392, + "num_tokens": 7596763.0, + "step": 7200 + }, + { + "entropy": 0.15959744892315939, + "epoch": 0.6371392667386847, + "grad_norm": 8.6875, + "learning_rate": 0.0001830638945739459, + "loss": 0.20813972473144532, + "mean_token_accuracy": 0.956459388434887, + "num_tokens": 7623283.0, + "step": 7225 + }, + { + "entropy": 0.15210882770828904, + "epoch": 0.639343900879649, + "grad_norm": 5.1875, + "learning_rate": 0.00018293113461182204, + "loss": 0.20834075927734375, + "mean_token_accuracy": 0.9586859959363937, + "num_tokens": 7649230.0, + "step": 7250 + }, + { + "entropy": 0.14857870964333414, + "epoch": 0.6415485350206134, + "grad_norm": 4.0, + "learning_rate": 0.00018279790486852693, + "loss": 0.19865785598754881, + "mean_token_accuracy": 0.9572332391142845, + "num_tokens": 7676765.0, + "step": 7275 + }, + { + "entropy": 0.1303456796729006, + "epoch": 0.6437531691615777, + "grad_norm": 2.4375, + "learning_rate": 0.00018266420609876885, + "loss": 0.16255685806274414, + "mean_token_accuracy": 0.963574868440628, + "num_tokens": 7702032.0, + "step": 7300 + }, + { + "entropy": 0.1390564618736971, + "epoch": 0.6459578033025419, + "grad_norm": 5.125, + "learning_rate": 0.0001825300390599132, + "loss": 0.15615715980529785, + "mean_token_accuracy": 0.9627314421534539, + "num_tokens": 7727282.0, + "step": 7325 + }, + { + "entropy": 0.15579893960035407, + "epoch": 0.6481624374435062, + "grad_norm": 2.515625, + "learning_rate": 0.0001823954045119779, + "loss": 0.1994219970703125, + "mean_token_accuracy": 0.9553171017765999, + "num_tokens": 7754466.0, + "step": 7350 + }, + { + "entropy": 0.1711575994535815, + "epoch": 0.6503670715844706, + "grad_norm": 5.34375, + "learning_rate": 0.0001822603032176291, + "loss": 0.2133725929260254, + "mean_token_accuracy": 0.9545243400335311, + "num_tokens": 7781859.0, + "step": 7375 + }, + { + "entropy": 0.12246490114834159, + "epoch": 0.6525717057254349, + "grad_norm": 4.625, + "learning_rate": 0.00018212473594217708, + "loss": 0.15401289939880372, + "mean_token_accuracy": 0.9670217049121856, + "num_tokens": 7807817.0, + "step": 7400 + }, + { + "entropy": 0.14397913629829417, + "epoch": 0.6547763398663992, + "grad_norm": 4.71875, + "learning_rate": 0.00018198870345357169, + "loss": 0.2187386131286621, + "mean_token_accuracy": 0.9631443306803703, + "num_tokens": 7834226.0, + "step": 7425 + }, + { + "entropy": 0.12889255251706344, + "epoch": 0.6569809740073634, + "grad_norm": 4.0625, + "learning_rate": 0.00018185220652239807, + "loss": 0.17164052963256837, + "mean_token_accuracy": 0.9652298155426979, + "num_tokens": 7860383.0, + "step": 7450 + }, + { + "entropy": 0.14436891936697066, + "epoch": 0.6591856081483278, + "grad_norm": 2.625, + "learning_rate": 0.00018171524592187237, + "loss": 0.1861957550048828, + "mean_token_accuracy": 0.9579941752552986, + "num_tokens": 7885728.0, + "step": 7475 + }, + { + "entropy": 0.15438391114235855, + "epoch": 0.6613902422892921, + "grad_norm": 5.625, + "learning_rate": 0.00018157782242783722, + "loss": 0.21105031967163085, + "mean_token_accuracy": 0.9592954632639885, + "num_tokens": 7912175.0, + "step": 7500 + }, + { + "entropy": 0.1324482882732991, + "epoch": 0.6635948764302564, + "grad_norm": 2.09375, + "learning_rate": 0.00018143993681875737, + "loss": 0.1580478572845459, + "mean_token_accuracy": 0.9634089484810829, + "num_tokens": 7938822.0, + "step": 7525 + }, + { + "entropy": 0.14074511501239612, + "epoch": 0.6657995105712207, + "grad_norm": 8.625, + "learning_rate": 0.00018130158987571547, + "loss": 0.1955801010131836, + "mean_token_accuracy": 0.9611077249050141, + "num_tokens": 7964876.0, + "step": 7550 + }, + { + "entropy": 0.13242758714361116, + "epoch": 0.668004144712185, + "grad_norm": 4.78125, + "learning_rate": 0.00018116278238240735, + "loss": 0.15283177375793458, + "mean_token_accuracy": 0.9638253870606422, + "num_tokens": 7990616.0, + "step": 7575 + }, + { + "entropy": 0.11454111101804301, + "epoch": 0.6702087788531493, + "grad_norm": 5.9375, + "learning_rate": 0.0001810235151251378, + "loss": 0.14967763900756836, + "mean_token_accuracy": 0.9682707318663597, + "num_tokens": 8016050.0, + "step": 7600 + }, + { + "entropy": 0.11883930406940635, + "epoch": 0.6724134129941136, + "grad_norm": 4.34375, + "learning_rate": 0.00018088378889281602, + "loss": 0.15552496910095215, + "mean_token_accuracy": 0.9669474244117737, + "num_tokens": 8042320.0, + "step": 7625 + }, + { + "entropy": 0.1356273195706308, + "epoch": 0.6746180471350779, + "grad_norm": 6.625, + "learning_rate": 0.00018074360447695113, + "loss": 0.19098442077636718, + "mean_token_accuracy": 0.9609265148639679, + "num_tokens": 8068871.0, + "step": 7650 + }, + { + "entropy": 0.15299125998280944, + "epoch": 0.6768226812760423, + "grad_norm": 5.21875, + "learning_rate": 0.00018060296267164789, + "loss": 0.17693784713745117, + "mean_token_accuracy": 0.9612268942594528, + "num_tokens": 8094092.0, + "step": 7675 + }, + { + "entropy": 0.12178941715159454, + "epoch": 0.6790273154170066, + "grad_norm": 4.875, + "learning_rate": 0.00018046186427360177, + "loss": 0.17469738006591798, + "mean_token_accuracy": 0.9637601950764656, + "num_tokens": 8120040.0, + "step": 7700 + }, + { + "entropy": 0.14162486122571863, + "epoch": 0.6812319495579708, + "grad_norm": 6.0, + "learning_rate": 0.00018032031008209502, + "loss": 0.19666408538818358, + "mean_token_accuracy": 0.9620895192027092, + "num_tokens": 8145821.0, + "step": 7725 + }, + { + "entropy": 0.1300551049981732, + "epoch": 0.6834365836989351, + "grad_norm": 2.828125, + "learning_rate": 0.00018017830089899154, + "loss": 0.202286434173584, + "mean_token_accuracy": 0.96599944293499, + "num_tokens": 8172465.0, + "step": 7750 + }, + { + "entropy": 0.13608546175644734, + "epoch": 0.6856412178398995, + "grad_norm": 2.171875, + "learning_rate": 0.00018003583752873283, + "loss": 0.19469423294067384, + "mean_token_accuracy": 0.9608579614758491, + "num_tokens": 8199428.0, + "step": 7775 + }, + { + "entropy": 0.17455945937603246, + "epoch": 0.6878458519808638, + "grad_norm": 7.0625, + "learning_rate": 0.00017989292077833313, + "loss": 0.22791748046875, + "mean_token_accuracy": 0.9548394048213958, + "num_tokens": 8226599.0, + "step": 7800 + }, + { + "entropy": 0.11592018370109144, + "epoch": 0.6900504861218281, + "grad_norm": 5.0625, + "learning_rate": 0.000179749551457375, + "loss": 0.16286245346069336, + "mean_token_accuracy": 0.9658330598473549, + "num_tokens": 8253197.0, + "step": 7825 + }, + { + "entropy": 0.12122963964822703, + "epoch": 0.6922551202627923, + "grad_norm": 8.125, + "learning_rate": 0.00017960573037800463, + "loss": 0.1468034553527832, + "mean_token_accuracy": 0.9656573352217674, + "num_tokens": 8278589.0, + "step": 7850 + }, + { + "entropy": 0.13613144385279155, + "epoch": 0.6944597544037567, + "grad_norm": 2.4375, + "learning_rate": 0.00017946145835492735, + "loss": 0.1826793670654297, + "mean_token_accuracy": 0.9617566013336182, + "num_tokens": 8305687.0, + "step": 7875 + }, + { + "entropy": 0.129446152941091, + "epoch": 0.696664388544721, + "grad_norm": 4.125, + "learning_rate": 0.0001793167362054029, + "loss": 0.19100290298461914, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 8331350.0, + "step": 7900 + }, + { + "entropy": 0.11511508471041453, + "epoch": 0.6988690226856853, + "grad_norm": 3.03125, + "learning_rate": 0.0001791715647492409, + "loss": 0.14485804557800294, + "mean_token_accuracy": 0.9666005888581276, + "num_tokens": 8357509.0, + "step": 7925 + }, + { + "entropy": 0.1282959767448483, + "epoch": 0.7010736568266496, + "grad_norm": 2.734375, + "learning_rate": 0.00017902594480879622, + "loss": 0.17366495132446289, + "mean_token_accuracy": 0.9591986963152885, + "num_tokens": 8384200.0, + "step": 7950 + }, + { + "entropy": 0.1282369938242482, + "epoch": 0.703278290967614, + "grad_norm": 5.53125, + "learning_rate": 0.00017887987720896406, + "loss": 0.1773942756652832, + "mean_token_accuracy": 0.9644914370775223, + "num_tokens": 8410909.0, + "step": 7975 + }, + { + "entropy": 0.1158729085046798, + "epoch": 0.7054829251085782, + "grad_norm": 4.34375, + "learning_rate": 0.00017873336277717574, + "loss": 0.1543659210205078, + "mean_token_accuracy": 0.9665400749444961, + "num_tokens": 8436352.0, + "step": 8000 + }, + { + "epoch": 0.7054829251085782, + "eval_entropy": 0.07064501758729272, + "eval_loss": 0.09072922170162201, + "eval_mean_token_accuracy": 0.9757660988752379, + "eval_num_tokens": 8436352.0, + "eval_runtime": 235.5705, + "eval_samples_per_second": 16.679, + "eval_steps_per_second": 4.173, + "step": 8000 + }, + { + "entropy": 0.11761951618886086, + "epoch": 0.7076875592495425, + "grad_norm": 3.046875, + "learning_rate": 0.0001785864023433936, + "loss": 0.1570788288116455, + "mean_token_accuracy": 0.9673075690865517, + "num_tokens": 8461673.0, + "step": 8025 + }, + { + "entropy": 0.11300159072328825, + "epoch": 0.7098921933905068, + "grad_norm": 3.9375, + "learning_rate": 0.00017843899674010641, + "loss": 0.15068492889404297, + "mean_token_accuracy": 0.9611266851425171, + "num_tokens": 8488491.0, + "step": 8050 + }, + { + "entropy": 0.12430981354729738, + "epoch": 0.7120968275314712, + "grad_norm": 4.1875, + "learning_rate": 0.0001782911468023249, + "loss": 0.17625951766967773, + "mean_token_accuracy": 0.962748230099678, + "num_tokens": 8514512.0, + "step": 8075 + }, + { + "entropy": 0.12553293525765186, + "epoch": 0.7143014616724355, + "grad_norm": 5.3125, + "learning_rate": 0.00017814285336757664, + "loss": 0.16704462051391603, + "mean_token_accuracy": 0.96374351978302, + "num_tokens": 8540515.0, + "step": 8100 + }, + { + "entropy": 0.11906734269752633, + "epoch": 0.7165060958133997, + "grad_norm": 5.375, + "learning_rate": 0.00017799411727590153, + "loss": 0.14968764305114746, + "mean_token_accuracy": 0.9667163553833962, + "num_tokens": 8565892.0, + "step": 8125 + }, + { + "entropy": 0.1386713864444755, + "epoch": 0.718710729954364, + "grad_norm": 4.59375, + "learning_rate": 0.00017784493936984705, + "loss": 0.19567285537719725, + "mean_token_accuracy": 0.9598614898324013, + "num_tokens": 8592214.0, + "step": 8150 + }, + { + "entropy": 0.10956784428621176, + "epoch": 0.7209153640953284, + "grad_norm": 6.59375, + "learning_rate": 0.0001776953204944634, + "loss": 0.1550135135650635, + "mean_token_accuracy": 0.9683659729361535, + "num_tokens": 8618032.0, + "step": 8175 + }, + { + "entropy": 0.11613133529463085, + "epoch": 0.7231199982362927, + "grad_norm": 5.4375, + "learning_rate": 0.00017754526149729868, + "loss": 0.15899131774902345, + "mean_token_accuracy": 0.9638846132159233, + "num_tokens": 8644256.0, + "step": 8200 + }, + { + "entropy": 0.14448100091249216, + "epoch": 0.725324632377257, + "grad_norm": 3.0625, + "learning_rate": 0.00017739476322839427, + "loss": 0.17772165298461914, + "mean_token_accuracy": 0.9560806438326835, + "num_tokens": 8670775.0, + "step": 8225 + }, + { + "entropy": 0.12614294623752356, + "epoch": 0.7275292665182213, + "grad_norm": 4.4375, + "learning_rate": 0.00017724382654027985, + "loss": 0.1638924217224121, + "mean_token_accuracy": 0.9645272868871689, + "num_tokens": 8697753.0, + "step": 8250 + }, + { + "entropy": 0.12692207982821857, + "epoch": 0.7297339006591856, + "grad_norm": 3.265625, + "learning_rate": 0.00017709245228796856, + "loss": 0.1772811508178711, + "mean_token_accuracy": 0.9636184599995613, + "num_tokens": 8724660.0, + "step": 8275 + }, + { + "entropy": 0.12508263059076852, + "epoch": 0.7319385348001499, + "grad_norm": 8.0, + "learning_rate": 0.00017694064132895232, + "loss": 0.16712051391601562, + "mean_token_accuracy": 0.9643837228417397, + "num_tokens": 8750683.0, + "step": 8300 + }, + { + "entropy": 0.14396141051198355, + "epoch": 0.7341431689411142, + "grad_norm": 1.8671875, + "learning_rate": 0.0001767883945231968, + "loss": 0.19680845260620117, + "mean_token_accuracy": 0.9622863036394119, + "num_tokens": 8777248.0, + "step": 8325 + }, + { + "entropy": 0.1027900730050169, + "epoch": 0.7363478030820785, + "grad_norm": 5.3125, + "learning_rate": 0.00017663571273313658, + "loss": 0.15053895950317384, + "mean_token_accuracy": 0.9678994670510292, + "num_tokens": 8803326.0, + "step": 8350 + }, + { + "entropy": 0.1591771748010069, + "epoch": 0.7385524372230429, + "grad_norm": 4.21875, + "learning_rate": 0.00017648259682367042, + "loss": 0.1984667205810547, + "mean_token_accuracy": 0.9540786528587342, + "num_tokens": 8830566.0, + "step": 8375 + }, + { + "entropy": 0.1073954068531748, + "epoch": 0.7407570713640071, + "grad_norm": 6.0, + "learning_rate": 0.00017632904766215618, + "loss": 0.13572030067443847, + "mean_token_accuracy": 0.9658256524801254, + "num_tokens": 8855529.0, + "step": 8400 + }, + { + "entropy": 0.12134030096232891, + "epoch": 0.7429617055049714, + "grad_norm": 2.9375, + "learning_rate": 0.00017617506611840596, + "loss": 0.1593620204925537, + "mean_token_accuracy": 0.9623824095726013, + "num_tokens": 8883268.0, + "step": 8425 + }, + { + "entropy": 0.1269017940975027, + "epoch": 0.7451663396459357, + "grad_norm": 3.171875, + "learning_rate": 0.00017602065306468118, + "loss": 0.15877607345581055, + "mean_token_accuracy": 0.965101033449173, + "num_tokens": 8908917.0, + "step": 8450 + }, + { + "entropy": 0.11544961552310269, + "epoch": 0.7473709737869001, + "grad_norm": 7.375, + "learning_rate": 0.00017586580937568763, + "loss": 0.15559195518493651, + "mean_token_accuracy": 0.9670590379834175, + "num_tokens": 8934706.0, + "step": 8475 + }, + { + "entropy": 0.13363998796266968, + "epoch": 0.7495756079278644, + "grad_norm": 6.84375, + "learning_rate": 0.00017571053592857055, + "loss": 0.19044275283813478, + "mean_token_accuracy": 0.9597599840164185, + "num_tokens": 8962668.0, + "step": 8500 + }, + { + "entropy": 0.11302992556476965, + "epoch": 0.7517802420688287, + "grad_norm": 4.78125, + "learning_rate": 0.00017555483360290968, + "loss": 0.1435883617401123, + "mean_token_accuracy": 0.969543283879757, + "num_tokens": 8987940.0, + "step": 8525 + }, + { + "entropy": 0.12368935061385855, + "epoch": 0.7539848762097929, + "grad_norm": 3.1875, + "learning_rate": 0.0001753987032807141, + "loss": 0.15028656005859375, + "mean_token_accuracy": 0.9645920068025589, + "num_tokens": 9012995.0, + "step": 8550 + }, + { + "entropy": 0.12879173549008555, + "epoch": 0.7561895103507573, + "grad_norm": 5.375, + "learning_rate": 0.00017524214584641756, + "loss": 0.16836380004882812, + "mean_token_accuracy": 0.963255665898323, + "num_tokens": 9039182.0, + "step": 8575 + }, + { + "entropy": 0.13748559794155882, + "epoch": 0.7583941444917216, + "grad_norm": 6.40625, + "learning_rate": 0.0001750851621868731, + "loss": 0.1914215850830078, + "mean_token_accuracy": 0.9576075798273087, + "num_tokens": 9066623.0, + "step": 8600 + }, + { + "entropy": 0.13106511580059305, + "epoch": 0.7605987786326859, + "grad_norm": 5.625, + "learning_rate": 0.00017492775319134828, + "loss": 0.1786070442199707, + "mean_token_accuracy": 0.9630159053206444, + "num_tokens": 9093239.0, + "step": 8625 + }, + { + "entropy": 0.11938790226588025, + "epoch": 0.7628034127736502, + "grad_norm": 2.3125, + "learning_rate": 0.0001747699197515201, + "loss": 0.17017173767089844, + "mean_token_accuracy": 0.9645773348212242, + "num_tokens": 9120282.0, + "step": 8650 + }, + { + "entropy": 0.11365445770847145, + "epoch": 0.7650080469146145, + "grad_norm": 3.4375, + "learning_rate": 0.00017461166276146986, + "loss": 0.16278697967529296, + "mean_token_accuracy": 0.9636175134778022, + "num_tokens": 9146109.0, + "step": 8675 + }, + { + "entropy": 0.11282925648498349, + "epoch": 0.7672126810555788, + "grad_norm": 16.0, + "learning_rate": 0.00017445298311767818, + "loss": 0.1740219497680664, + "mean_token_accuracy": 0.9688749271631241, + "num_tokens": 9172545.0, + "step": 8700 + }, + { + "entropy": 0.154731562990346, + "epoch": 0.7694173151965431, + "grad_norm": 3.765625, + "learning_rate": 0.00017429388171901988, + "loss": 0.20200132369995116, + "mean_token_accuracy": 0.957536070048809, + "num_tokens": 9199798.0, + "step": 8725 + }, + { + "entropy": 0.11487452790664975, + "epoch": 0.7716219493375075, + "grad_norm": 4.53125, + "learning_rate": 0.00017413435946675887, + "loss": 0.16370033264160155, + "mean_token_accuracy": 0.9644585126638412, + "num_tokens": 9226606.0, + "step": 8750 + }, + { + "entropy": 0.12250090444111265, + "epoch": 0.7738265834784718, + "grad_norm": 4.3125, + "learning_rate": 0.00017397441726454312, + "loss": 0.1650483512878418, + "mean_token_accuracy": 0.9627638322114944, + "num_tokens": 9253846.0, + "step": 8775 + }, + { + "entropy": 0.12166486009780783, + "epoch": 0.776031217619436, + "grad_norm": 4.125, + "learning_rate": 0.00017381405601839953, + "loss": 0.15638877868652343, + "mean_token_accuracy": 0.9642614278197289, + "num_tokens": 9280499.0, + "step": 8800 + }, + { + "entropy": 0.14424598600366154, + "epoch": 0.7782358517604003, + "grad_norm": 3.109375, + "learning_rate": 0.0001736532766367287, + "loss": 0.19656600952148437, + "mean_token_accuracy": 0.9637108337879181, + "num_tokens": 9306050.0, + "step": 8825 + }, + { + "entropy": 0.10348674034525175, + "epoch": 0.7804404859013647, + "grad_norm": 3.859375, + "learning_rate": 0.00017349208003029985, + "loss": 0.13371842384338378, + "mean_token_accuracy": 0.9682561081647872, + "num_tokens": 9331578.0, + "step": 8850 + }, + { + "entropy": 0.12325756757461931, + "epoch": 0.782645120042329, + "grad_norm": 3.34375, + "learning_rate": 0.00017333046711224566, + "loss": 0.1538134765625, + "mean_token_accuracy": 0.9658476275205612, + "num_tokens": 9356722.0, + "step": 8875 + }, + { + "entropy": 0.11024700316367671, + "epoch": 0.7848497541832933, + "grad_norm": 3.265625, + "learning_rate": 0.00017316843879805713, + "loss": 0.14064696311950683, + "mean_token_accuracy": 0.9689941748976707, + "num_tokens": 9383085.0, + "step": 8900 + }, + { + "entropy": 0.12162997972161975, + "epoch": 0.7870543883242576, + "grad_norm": 1.5390625, + "learning_rate": 0.0001730059960055784, + "loss": 0.17535165786743165, + "mean_token_accuracy": 0.9637424668669701, + "num_tokens": 9409568.0, + "step": 8925 + }, + { + "entropy": 0.13874562435550616, + "epoch": 0.789259022465222, + "grad_norm": 9.1875, + "learning_rate": 0.00017284313965500135, + "loss": 0.1936268424987793, + "mean_token_accuracy": 0.9646750277280808, + "num_tokens": 9437494.0, + "step": 8950 + }, + { + "entropy": 0.09118764624232427, + "epoch": 0.7914636566061862, + "grad_norm": 0.94921875, + "learning_rate": 0.00017267987066886073, + "loss": 0.09851828575134278, + "mean_token_accuracy": 0.9754169529676437, + "num_tokens": 9462219.0, + "step": 8975 + }, + { + "entropy": 0.12636590646347032, + "epoch": 0.7936682907471505, + "grad_norm": 2.796875, + "learning_rate": 0.00017251618997202864, + "loss": 0.19156661987304688, + "mean_token_accuracy": 0.9624759146571159, + "num_tokens": 9489546.0, + "step": 9000 + }, + { + "epoch": 0.7936682907471505, + "eval_entropy": 0.06704333229587316, + "eval_loss": 0.07078936696052551, + "eval_mean_token_accuracy": 0.9801562535192814, + "eval_num_tokens": 9489546.0, + "eval_runtime": 227.2118, + "eval_samples_per_second": 17.292, + "eval_steps_per_second": 4.326, + "step": 9000 + }, + { + "entropy": 0.12248482231225352, + "epoch": 0.7958729248881148, + "grad_norm": 4.28125, + "learning_rate": 0.0001723520984917095, + "loss": 0.21896730422973631, + "mean_token_accuracy": 0.9658117419481278, + "num_tokens": 9516203.0, + "step": 9025 + }, + { + "entropy": 0.13453102754894644, + "epoch": 0.7980775590290792, + "grad_norm": 3.90625, + "learning_rate": 0.00017218759715743453, + "loss": 0.16762969970703126, + "mean_token_accuracy": 0.9634674605727196, + "num_tokens": 9544002.0, + "step": 9050 + }, + { + "entropy": 0.1296102111687651, + "epoch": 0.8002821931700435, + "grad_norm": 3.71875, + "learning_rate": 0.00017202268690105684, + "loss": 0.1634498977661133, + "mean_token_accuracy": 0.9662830474972725, + "num_tokens": 9570760.0, + "step": 9075 + }, + { + "entropy": 0.09645530783745926, + "epoch": 0.8024868273110077, + "grad_norm": 1.5, + "learning_rate": 0.00017185736865674592, + "loss": 0.1314550495147705, + "mean_token_accuracy": 0.9713372036814689, + "num_tokens": 9597145.0, + "step": 9100 + }, + { + "entropy": 0.1334079783054767, + "epoch": 0.804691461451972, + "grad_norm": 3.859375, + "learning_rate": 0.00017169164336098227, + "loss": 0.18433303833007814, + "mean_token_accuracy": 0.9601713407039643, + "num_tokens": 9623937.0, + "step": 9125 + }, + { + "entropy": 0.12334824909456074, + "epoch": 0.8068960955929364, + "grad_norm": 4.03125, + "learning_rate": 0.0001715255119525524, + "loss": 0.16472867965698243, + "mean_token_accuracy": 0.9639821222424507, + "num_tokens": 9650956.0, + "step": 9150 + }, + { + "entropy": 0.11855432161130011, + "epoch": 0.8091007297339007, + "grad_norm": 5.90625, + "learning_rate": 0.0001713589753725432, + "loss": 0.16404468536376954, + "mean_token_accuracy": 0.9677329239249229, + "num_tokens": 9677654.0, + "step": 9175 + }, + { + "entropy": 0.13992778839776293, + "epoch": 0.811305363874865, + "grad_norm": 2.203125, + "learning_rate": 0.00017119203456433682, + "loss": 0.17372175216674804, + "mean_token_accuracy": 0.9608005279302597, + "num_tokens": 9703015.0, + "step": 9200 + }, + { + "entropy": 0.11712702068965881, + "epoch": 0.8135099980158292, + "grad_norm": 2.171875, + "learning_rate": 0.00017102469047360525, + "loss": 0.14785463333129883, + "mean_token_accuracy": 0.969508444070816, + "num_tokens": 9728261.0, + "step": 9225 + }, + { + "entropy": 0.12233627852867357, + "epoch": 0.8157146321567936, + "grad_norm": 2.5625, + "learning_rate": 0.00017085694404830485, + "loss": 0.16637359619140624, + "mean_token_accuracy": 0.9676308873295784, + "num_tokens": 9754044.0, + "step": 9250 + }, + { + "entropy": 0.11624026085250079, + "epoch": 0.8179192662977579, + "grad_norm": 14.375, + "learning_rate": 0.00017068879623867122, + "loss": 0.18700075149536133, + "mean_token_accuracy": 0.9679171699285507, + "num_tokens": 9780803.0, + "step": 9275 + }, + { + "entropy": 0.12290247712633573, + "epoch": 0.8201239004387222, + "grad_norm": 2.03125, + "learning_rate": 0.0001705202479972136, + "loss": 0.15535794258117674, + "mean_token_accuracy": 0.9672988221049309, + "num_tokens": 9806834.0, + "step": 9300 + }, + { + "entropy": 0.11379165446502157, + "epoch": 0.8223285345796865, + "grad_norm": 2.875, + "learning_rate": 0.00017035130027870965, + "loss": 0.13857620239257812, + "mean_token_accuracy": 0.9699993515014649, + "num_tokens": 9833002.0, + "step": 9325 + }, + { + "entropy": 0.10841709365835413, + "epoch": 0.8245331687206509, + "grad_norm": 2.40625, + "learning_rate": 0.00017018195404019983, + "loss": 0.1723048782348633, + "mean_token_accuracy": 0.9657841363549232, + "num_tokens": 9859181.0, + "step": 9350 + }, + { + "entropy": 0.11595107901492156, + "epoch": 0.8267378028616151, + "grad_norm": 4.8125, + "learning_rate": 0.00017001221024098224, + "loss": 0.1496266269683838, + "mean_token_accuracy": 0.9674661475419998, + "num_tokens": 9886731.0, + "step": 9375 + }, + { + "entropy": 0.11216982064011972, + "epoch": 0.8289424370025794, + "grad_norm": 4.25, + "learning_rate": 0.00016984206984260695, + "loss": 0.14420658111572265, + "mean_token_accuracy": 0.9676317548751832, + "num_tokens": 9911898.0, + "step": 9400 + }, + { + "entropy": 0.11021179292700253, + "epoch": 0.8311470711435437, + "grad_norm": 5.65625, + "learning_rate": 0.00016967153380887068, + "loss": 0.16620290756225586, + "mean_token_accuracy": 0.964969280064106, + "num_tokens": 9938802.0, + "step": 9425 + }, + { + "entropy": 0.11348463983624242, + "epoch": 0.8333517052845081, + "grad_norm": 2.0, + "learning_rate": 0.00016950060310581133, + "loss": 0.1321702194213867, + "mean_token_accuracy": 0.9677722403407096, + "num_tokens": 9964208.0, + "step": 9450 + }, + { + "entropy": 0.11066792839614209, + "epoch": 0.8355563394254724, + "grad_norm": 5.0, + "learning_rate": 0.00016932927870170244, + "loss": 0.1695878028869629, + "mean_token_accuracy": 0.962311232984066, + "num_tokens": 9992411.0, + "step": 9475 + }, + { + "entropy": 0.11070761349285022, + "epoch": 0.8377609735664366, + "grad_norm": 5.28125, + "learning_rate": 0.0001691575615670478, + "loss": 0.1375373935699463, + "mean_token_accuracy": 0.9715418082475662, + "num_tokens": 10017700.0, + "step": 9500 + }, + { + "entropy": 0.11562138024251908, + "epoch": 0.8399656077074009, + "grad_norm": 6.5625, + "learning_rate": 0.0001689854526745759, + "loss": 0.15139179229736327, + "mean_token_accuracy": 0.9657182011008263, + "num_tokens": 10044558.0, + "step": 9525 + }, + { + "entropy": 0.11778819782077335, + "epoch": 0.8421702418483653, + "grad_norm": 2.75, + "learning_rate": 0.00016881295299923444, + "loss": 0.1503695774078369, + "mean_token_accuracy": 0.9710509559512138, + "num_tokens": 10071358.0, + "step": 9550 + }, + { + "entropy": 0.1135219477955252, + "epoch": 0.8443748759893296, + "grad_norm": 2.8125, + "learning_rate": 0.00016864006351818473, + "loss": 0.1394341278076172, + "mean_token_accuracy": 0.9689531043171883, + "num_tokens": 10096955.0, + "step": 9575 + }, + { + "entropy": 0.10299550660944078, + "epoch": 0.8465795101302939, + "grad_norm": 3.984375, + "learning_rate": 0.00016846678521079627, + "loss": 0.12168859481811524, + "mean_token_accuracy": 0.9724226155877114, + "num_tokens": 10123250.0, + "step": 9600 + }, + { + "entropy": 0.11091373673116323, + "epoch": 0.8487841442712581, + "grad_norm": 3.078125, + "learning_rate": 0.00016829311905864114, + "loss": 0.16308366775512695, + "mean_token_accuracy": 0.967094420492649, + "num_tokens": 10150855.0, + "step": 9625 + }, + { + "entropy": 0.11484327328740619, + "epoch": 0.8509887784122225, + "grad_norm": 5.40625, + "learning_rate": 0.00016811906604548846, + "loss": 0.14888720512390136, + "mean_token_accuracy": 0.9673416930437088, + "num_tokens": 10178188.0, + "step": 9650 + }, + { + "entropy": 0.12452099201036618, + "epoch": 0.8531934125531868, + "grad_norm": 4.53125, + "learning_rate": 0.00016794462715729878, + "loss": 0.16681873321533203, + "mean_token_accuracy": 0.9647532105445862, + "num_tokens": 10205220.0, + "step": 9675 + }, + { + "entropy": 0.0996164298261283, + "epoch": 0.8553980466941511, + "grad_norm": 3.9375, + "learning_rate": 0.0001677698033822185, + "loss": 0.1467697811126709, + "mean_token_accuracy": 0.9715581625699997, + "num_tokens": 10231496.0, + "step": 9700 + }, + { + "entropy": 0.10809524114127271, + "epoch": 0.8576026808351154, + "grad_norm": 4.625, + "learning_rate": 0.00016759459571057445, + "loss": 0.14575215339660644, + "mean_token_accuracy": 0.9664224565029145, + "num_tokens": 10257436.0, + "step": 9725 + }, + { + "entropy": 0.1113849281903822, + "epoch": 0.8598073149760798, + "grad_norm": 3.84375, + "learning_rate": 0.0001674190051348679, + "loss": 0.1565341281890869, + "mean_token_accuracy": 0.9675761547684669, + "num_tokens": 10283162.0, + "step": 9750 + }, + { + "entropy": 0.13911948825232684, + "epoch": 0.862011949117044, + "grad_norm": 4.96875, + "learning_rate": 0.00016724303264976928, + "loss": 0.20747381210327148, + "mean_token_accuracy": 0.9587804532051086, + "num_tokens": 10310772.0, + "step": 9775 + }, + { + "entropy": 0.09862252110615373, + "epoch": 0.8642165832580083, + "grad_norm": 3.984375, + "learning_rate": 0.00016706667925211246, + "loss": 0.11554966926574707, + "mean_token_accuracy": 0.9711972030997277, + "num_tokens": 10337443.0, + "step": 9800 + }, + { + "entropy": 0.12474286954733543, + "epoch": 0.8664212173989726, + "grad_norm": 3.578125, + "learning_rate": 0.00016688994594088897, + "loss": 0.16256399154663087, + "mean_token_accuracy": 0.9645921823382377, + "num_tokens": 10364440.0, + "step": 9825 + }, + { + "entropy": 0.11431661496520974, + "epoch": 0.868625851539937, + "grad_norm": 1.1484375, + "learning_rate": 0.00016671283371724258, + "loss": 0.17474599838256835, + "mean_token_accuracy": 0.9650062435865402, + "num_tokens": 10390382.0, + "step": 9850 + }, + { + "entropy": 0.11188884009723551, + "epoch": 0.8708304856809013, + "grad_norm": 5.40625, + "learning_rate": 0.00016653534358446333, + "loss": 0.16477123260498047, + "mean_token_accuracy": 0.9655068427324295, + "num_tokens": 10417334.0, + "step": 9875 + }, + { + "entropy": 0.13186935628706123, + "epoch": 0.8730351198218655, + "grad_norm": 4.0625, + "learning_rate": 0.0001663574765479821, + "loss": 0.16050004959106445, + "mean_token_accuracy": 0.9649894621968269, + "num_tokens": 10445100.0, + "step": 9900 + }, + { + "entropy": 0.09267763423384168, + "epoch": 0.8752397539628298, + "grad_norm": 3.03125, + "learning_rate": 0.00016617923361536481, + "loss": 0.12584033966064453, + "mean_token_accuracy": 0.9726918703317642, + "num_tokens": 10471396.0, + "step": 9925 + }, + { + "entropy": 0.10969781915540806, + "epoch": 0.8774443881037942, + "grad_norm": 4.28125, + "learning_rate": 0.00016600061579630682, + "loss": 0.1531198787689209, + "mean_token_accuracy": 0.9685003688931465, + "num_tokens": 10496896.0, + "step": 9950 + }, + { + "entropy": 0.10426673835027031, + "epoch": 0.8796490222447585, + "grad_norm": 2.71875, + "learning_rate": 0.00016582162410262683, + "loss": 0.141700439453125, + "mean_token_accuracy": 0.9694043102860451, + "num_tokens": 10524052.0, + "step": 9975 + }, + { + "entropy": 0.10985518074186984, + "epoch": 0.8818536563857228, + "grad_norm": 5.5625, + "learning_rate": 0.0001656422595482618, + "loss": 0.1478671932220459, + "mean_token_accuracy": 0.9703094553947449, + "num_tokens": 10550484.0, + "step": 10000 + }, + { + "epoch": 0.8818536563857228, + "eval_entropy": 0.06747981444776195, + "eval_loss": 0.06752390414476395, + "eval_mean_token_accuracy": 0.9817088659272917, + "eval_num_tokens": 10550484.0, + "eval_runtime": 245.6253, + "eval_samples_per_second": 15.996, + "eval_steps_per_second": 4.002, + "step": 10000 + }, + { + "entropy": 0.12341946017812006, + "epoch": 0.884058290526687, + "grad_norm": 1.953125, + "learning_rate": 0.0001654625231492605, + "loss": 0.16616138458251953, + "mean_token_accuracy": 0.967511080801487, + "num_tokens": 10577372.0, + "step": 10025 + }, + { + "entropy": 0.13170893985428847, + "epoch": 0.8862629246676514, + "grad_norm": 3.640625, + "learning_rate": 0.00016528241592377838, + "loss": 0.19268482208251952, + "mean_token_accuracy": 0.9656442356109619, + "num_tokens": 10604954.0, + "step": 10050 + }, + { + "entropy": 0.10221802647865844, + "epoch": 0.8884675588086157, + "grad_norm": 5.0625, + "learning_rate": 0.00016510193889207128, + "loss": 0.14701667785644532, + "mean_token_accuracy": 0.9697395420074463, + "num_tokens": 10631480.0, + "step": 10075 + }, + { + "entropy": 0.11693994878907688, + "epoch": 0.89067219294958, + "grad_norm": 3.53125, + "learning_rate": 0.00016492109307649012, + "loss": 0.1495195198059082, + "mean_token_accuracy": 0.9655201029777527, + "num_tokens": 10657009.0, + "step": 10100 + }, + { + "entropy": 0.1266687322355574, + "epoch": 0.8928768270905443, + "grad_norm": 3.015625, + "learning_rate": 0.00016473987950147464, + "loss": 0.1930523109436035, + "mean_token_accuracy": 0.9648179452121258, + "num_tokens": 10684895.0, + "step": 10125 + }, + { + "entropy": 0.1290986440368579, + "epoch": 0.8950814612315087, + "grad_norm": 3.40625, + "learning_rate": 0.00016455829919354798, + "loss": 0.15923919677734374, + "mean_token_accuracy": 0.9646768179535866, + "num_tokens": 10712424.0, + "step": 10150 + }, + { + "entropy": 0.09042322851746576, + "epoch": 0.897286095372473, + "grad_norm": 4.28125, + "learning_rate": 0.00016437635318131068, + "loss": 0.1193246841430664, + "mean_token_accuracy": 0.9700718694925308, + "num_tokens": 10738436.0, + "step": 10175 + }, + { + "entropy": 0.11201601465581916, + "epoch": 0.8994907295134372, + "grad_norm": 6.5625, + "learning_rate": 0.0001641940424954349, + "loss": 0.1540191078186035, + "mean_token_accuracy": 0.9686597174406052, + "num_tokens": 10765461.0, + "step": 10200 + }, + { + "entropy": 0.11977429527789354, + "epoch": 0.9016953636544015, + "grad_norm": 2.703125, + "learning_rate": 0.00016401136816865858, + "loss": 0.1758272933959961, + "mean_token_accuracy": 0.9664454838633537, + "num_tokens": 10792485.0, + "step": 10225 + }, + { + "entropy": 0.1012404073565267, + "epoch": 0.9038999977953659, + "grad_norm": 1.5390625, + "learning_rate": 0.0001638283312357795, + "loss": 0.12225442886352539, + "mean_token_accuracy": 0.9731503677368164, + "num_tokens": 10817757.0, + "step": 10250 + }, + { + "entropy": 0.13167083914973773, + "epoch": 0.9061046319363302, + "grad_norm": 4.3125, + "learning_rate": 0.0001636449327336496, + "loss": 0.17429544448852538, + "mean_token_accuracy": 0.9628306743502617, + "num_tokens": 10845750.0, + "step": 10275 + }, + { + "entropy": 0.08767559766652994, + "epoch": 0.9083092660772945, + "grad_norm": 3.96875, + "learning_rate": 0.00016346117370116898, + "loss": 0.10843348503112793, + "mean_token_accuracy": 0.9764333325624466, + "num_tokens": 10871753.0, + "step": 10300 + }, + { + "entropy": 0.10268983447313076, + "epoch": 0.9105139002182587, + "grad_norm": 1.4921875, + "learning_rate": 0.00016327705517927998, + "loss": 0.15269672393798828, + "mean_token_accuracy": 0.969476004242897, + "num_tokens": 10897973.0, + "step": 10325 + }, + { + "entropy": 0.11594196151621873, + "epoch": 0.9127185343592231, + "grad_norm": 3.171875, + "learning_rate": 0.0001630925782109615, + "loss": 0.14017095565795898, + "mean_token_accuracy": 0.9712727320194244, + "num_tokens": 10925576.0, + "step": 10350 + }, + { + "entropy": 0.09633901939727366, + "epoch": 0.9149231685001874, + "grad_norm": 3.46875, + "learning_rate": 0.0001629077438412227, + "loss": 0.12564464569091796, + "mean_token_accuracy": 0.9723188054561615, + "num_tokens": 10953120.0, + "step": 10375 + }, + { + "entropy": 0.09749640888854628, + "epoch": 0.9171278026411517, + "grad_norm": 1.3125, + "learning_rate": 0.00016272255311709755, + "loss": 0.12157401084899902, + "mean_token_accuracy": 0.9729098379611969, + "num_tokens": 10980088.0, + "step": 10400 + }, + { + "entropy": 0.09341563728055917, + "epoch": 0.919332436782116, + "grad_norm": 3.640625, + "learning_rate": 0.00016253700708763848, + "loss": 0.12513632774353028, + "mean_token_accuracy": 0.9720851957798005, + "num_tokens": 11006112.0, + "step": 10425 + }, + { + "entropy": 0.11993184727121843, + "epoch": 0.9215370709230803, + "grad_norm": 4.375, + "learning_rate": 0.0001623511068039108, + "loss": 0.19810443878173828, + "mean_token_accuracy": 0.963067757487297, + "num_tokens": 11035186.0, + "step": 10450 + }, + { + "entropy": 0.10798994577955455, + "epoch": 0.9237417050640446, + "grad_norm": 1.0546875, + "learning_rate": 0.00016216485331898642, + "loss": 0.1386628818511963, + "mean_token_accuracy": 0.969612886607647, + "num_tokens": 11061651.0, + "step": 10475 + }, + { + "entropy": 0.11358262816502247, + "epoch": 0.9259463392050089, + "grad_norm": 1.2578125, + "learning_rate": 0.0001619782476879381, + "loss": 0.19460039138793944, + "mean_token_accuracy": 0.966367999613285, + "num_tokens": 11088569.0, + "step": 10500 + }, + { + "entropy": 0.1167042134446092, + "epoch": 0.9281509733459732, + "grad_norm": 5.78125, + "learning_rate": 0.0001617912909678334, + "loss": 0.15505099296569824, + "mean_token_accuracy": 0.9686264345049858, + "num_tokens": 11115599.0, + "step": 10525 + }, + { + "entropy": 0.1133784833503887, + "epoch": 0.9303556074869376, + "grad_norm": 2.28125, + "learning_rate": 0.00016160398421772868, + "loss": 0.1678891372680664, + "mean_token_accuracy": 0.9675005570054054, + "num_tokens": 11142314.0, + "step": 10550 + }, + { + "entropy": 0.09698103930742946, + "epoch": 0.9325602416279019, + "grad_norm": 3.21875, + "learning_rate": 0.0001614163284986632, + "loss": 0.14286434173583984, + "mean_token_accuracy": 0.969823080599308, + "num_tokens": 11168533.0, + "step": 10575 + }, + { + "entropy": 0.10122561583353672, + "epoch": 0.9347648757688661, + "grad_norm": 3.140625, + "learning_rate": 0.0001612283248736529, + "loss": 0.17256240844726561, + "mean_token_accuracy": 0.969860375225544, + "num_tokens": 11194503.0, + "step": 10600 + }, + { + "entropy": 0.10910014692519326, + "epoch": 0.9369695099098304, + "grad_norm": 1.1796875, + "learning_rate": 0.00016103997440768456, + "loss": 0.1422108840942383, + "mean_token_accuracy": 0.9693578332662582, + "num_tokens": 11219802.0, + "step": 10625 + }, + { + "entropy": 0.11992133250838378, + "epoch": 0.9391741440507948, + "grad_norm": 3.390625, + "learning_rate": 0.0001608512781677098, + "loss": 0.16429296493530274, + "mean_token_accuracy": 0.9664942741394043, + "num_tokens": 11248011.0, + "step": 10650 + }, + { + "entropy": 0.10503609779349063, + "epoch": 0.9413787781917591, + "grad_norm": 4.53125, + "learning_rate": 0.00016066223722263883, + "loss": 0.14775081634521484, + "mean_token_accuracy": 0.9697583362460136, + "num_tokens": 11275171.0, + "step": 10675 + }, + { + "entropy": 0.10717430006247014, + "epoch": 0.9435834123327234, + "grad_norm": 2.96875, + "learning_rate": 0.00016047285264333462, + "loss": 0.13536286354064941, + "mean_token_accuracy": 0.9718959403038024, + "num_tokens": 11302493.0, + "step": 10700 + }, + { + "entropy": 0.12358801149821375, + "epoch": 0.9457880464736876, + "grad_norm": 5.09375, + "learning_rate": 0.00016028312550260657, + "loss": 0.16921548843383788, + "mean_token_accuracy": 0.9650797703862191, + "num_tokens": 11330172.0, + "step": 10725 + }, + { + "entropy": 0.09451631994277704, + "epoch": 0.947992680614652, + "grad_norm": 0.84375, + "learning_rate": 0.00016009305687520478, + "loss": 0.13955138206481935, + "mean_token_accuracy": 0.9695536935329437, + "num_tokens": 11356828.0, + "step": 10750 + }, + { + "entropy": 0.09438925628404832, + "epoch": 0.9501973147556163, + "grad_norm": 5.03125, + "learning_rate": 0.00015990264783781366, + "loss": 0.13763263702392578, + "mean_token_accuracy": 0.9733442470431328, + "num_tokens": 11383487.0, + "step": 10775 + }, + { + "entropy": 0.10031276696827263, + "epoch": 0.9524019488965806, + "grad_norm": 2.65625, + "learning_rate": 0.00015971189946904603, + "loss": 0.11601199150085449, + "mean_token_accuracy": 0.9713838744163513, + "num_tokens": 11408411.0, + "step": 10800 + }, + { + "entropy": 0.08611906496749726, + "epoch": 0.9546065830375449, + "grad_norm": 3.390625, + "learning_rate": 0.0001595208128494368, + "loss": 0.12846466064453124, + "mean_token_accuracy": 0.974527842104435, + "num_tokens": 11434288.0, + "step": 10825 + }, + { + "entropy": 0.09868820410221815, + "epoch": 0.9568112171785093, + "grad_norm": 4.6875, + "learning_rate": 0.0001593293890614371, + "loss": 0.14539647102355957, + "mean_token_accuracy": 0.9727686950564385, + "num_tokens": 11461369.0, + "step": 10850 + }, + { + "entropy": 0.10726987033616751, + "epoch": 0.9590158513194735, + "grad_norm": 1.6328125, + "learning_rate": 0.00015913762918940796, + "loss": 0.15492965698242187, + "mean_token_accuracy": 0.9678599080443382, + "num_tokens": 11487252.0, + "step": 10875 + }, + { + "entropy": 0.11905311323818751, + "epoch": 0.9612204854604378, + "grad_norm": 3.375, + "learning_rate": 0.0001589455343196142, + "loss": 0.1703525733947754, + "mean_token_accuracy": 0.9683392706513405, + "num_tokens": 11513191.0, + "step": 10900 + }, + { + "entropy": 0.09980171727540438, + "epoch": 0.9634251196014022, + "grad_norm": 0.416015625, + "learning_rate": 0.0001587531055402184, + "loss": 0.140428409576416, + "mean_token_accuracy": 0.9712198379635811, + "num_tokens": 11538104.0, + "step": 10925 + }, + { + "entropy": 0.08444582270516549, + "epoch": 0.9656297537423665, + "grad_norm": 2.921875, + "learning_rate": 0.00015856034394127458, + "loss": 0.11888874053955079, + "mean_token_accuracy": 0.9720085749030113, + "num_tokens": 11563627.0, + "step": 10950 + }, + { + "entropy": 0.08891735835233704, + "epoch": 0.9678343878833308, + "grad_norm": 5.6875, + "learning_rate": 0.00015836725061472206, + "loss": 0.11186134338378906, + "mean_token_accuracy": 0.9750255295634269, + "num_tokens": 11588193.0, + "step": 10975 + }, + { + "entropy": 0.0976211961361696, + "epoch": 0.970039022024295, + "grad_norm": 5.65625, + "learning_rate": 0.00015817382665437934, + "loss": 0.13658031463623047, + "mean_token_accuracy": 0.9715496882796287, + "num_tokens": 11613550.0, + "step": 11000 + }, + { + "epoch": 0.970039022024295, + "eval_entropy": 0.04970914650081239, + "eval_loss": 0.05864229425787926, + "eval_mean_token_accuracy": 0.9834782193175168, + "eval_num_tokens": 11613550.0, + "eval_runtime": 244.2638, + "eval_samples_per_second": 16.085, + "eval_steps_per_second": 4.024, + "step": 11000 + }, + { + "entropy": 0.09031637163367122, + "epoch": 0.9722436561652594, + "grad_norm": 6.5, + "learning_rate": 0.00015798007315593793, + "loss": 0.13898273468017577, + "mean_token_accuracy": 0.973123728632927, + "num_tokens": 11640911.0, + "step": 11025 + }, + { + "entropy": 0.10300864099699539, + "epoch": 0.9744482903062237, + "grad_norm": 1.9140625, + "learning_rate": 0.00015778599121695592, + "loss": 0.14298954963684082, + "mean_token_accuracy": 0.9674796098470688, + "num_tokens": 11667215.0, + "step": 11050 + }, + { + "entropy": 0.0918053998821415, + "epoch": 0.976652924447188, + "grad_norm": 2.53125, + "learning_rate": 0.0001575915819368521, + "loss": 0.1144057846069336, + "mean_token_accuracy": 0.9736630937457085, + "num_tokens": 11693740.0, + "step": 11075 + }, + { + "entropy": 0.0991861937311478, + "epoch": 0.9788575585881523, + "grad_norm": 3.25, + "learning_rate": 0.0001573968464168994, + "loss": 0.1382896327972412, + "mean_token_accuracy": 0.9709669390320778, + "num_tokens": 11721792.0, + "step": 11100 + }, + { + "entropy": 0.10513557809928897, + "epoch": 0.9810621927291167, + "grad_norm": 1.2421875, + "learning_rate": 0.00015720178576021892, + "loss": 0.14281551361083986, + "mean_token_accuracy": 0.9703371456265449, + "num_tokens": 11748074.0, + "step": 11125 + }, + { + "entropy": 0.07274828981404426, + "epoch": 0.9832668268700809, + "grad_norm": 3.640625, + "learning_rate": 0.0001570064010717735, + "loss": 0.08963616371154785, + "mean_token_accuracy": 0.9772802451252938, + "num_tokens": 11774661.0, + "step": 11150 + }, + { + "entropy": 0.09237290589720942, + "epoch": 0.9854714610110452, + "grad_norm": 0.8515625, + "learning_rate": 0.00015681069345836157, + "loss": 0.12891390800476074, + "mean_token_accuracy": 0.9716602417826653, + "num_tokens": 11802101.0, + "step": 11175 + }, + { + "entropy": 0.10531749608111568, + "epoch": 0.9876760951520095, + "grad_norm": 1.6953125, + "learning_rate": 0.0001566146640286108, + "loss": 0.16108951568603516, + "mean_token_accuracy": 0.9681805393099785, + "num_tokens": 11828012.0, + "step": 11200 + }, + { + "entropy": 0.10035160956729669, + "epoch": 0.9898807292929739, + "grad_norm": 3.578125, + "learning_rate": 0.00015641831389297188, + "loss": 0.1179033374786377, + "mean_token_accuracy": 0.9694277629256248, + "num_tokens": 11854834.0, + "step": 11225 + }, + { + "entropy": 0.10796484625694575, + "epoch": 0.9920853634339382, + "grad_norm": 3.46875, + "learning_rate": 0.00015622164416371218, + "loss": 0.16446849822998047, + "mean_token_accuracy": 0.9640506872534752, + "num_tokens": 11882106.0, + "step": 11250 + }, + { + "entropy": 0.1411357653059531, + "epoch": 0.9942899975749024, + "grad_norm": 1.1484375, + "learning_rate": 0.00015602465595490953, + "loss": 0.19301389694213866, + "mean_token_accuracy": 0.9607040500640869, + "num_tokens": 11909716.0, + "step": 11275 + }, + { + "entropy": 0.0925849473947892, + "epoch": 0.9964946317158667, + "grad_norm": 2.015625, + "learning_rate": 0.00015582735038244575, + "loss": 0.1028307056427002, + "mean_token_accuracy": 0.9762453472614289, + "num_tokens": 11935816.0, + "step": 11300 + }, + { + "entropy": 0.08189033437593025, + "epoch": 0.9986992658568311, + "grad_norm": 2.296875, + "learning_rate": 0.00015562972856400062, + "loss": 0.1367438793182373, + "mean_token_accuracy": 0.972346597313881, + "num_tokens": 11961616.0, + "step": 11325 + }, + { + "entropy": 0.11836909589115437, + "epoch": 1.0008818536563857, + "grad_norm": 2.15625, + "learning_rate": 0.00015543179161904517, + "loss": 0.1542993450164795, + "mean_token_accuracy": 0.9657085614674019, + "num_tokens": 11987237.0, + "step": 11350 + }, + { + "entropy": 0.08593971867026994, + "epoch": 1.00308648779735, + "grad_norm": 3.640625, + "learning_rate": 0.00015523354066883567, + "loss": 0.1126481533050537, + "mean_token_accuracy": 0.97596212297678, + "num_tokens": 12013371.0, + "step": 11375 + }, + { + "entropy": 0.07548106749542058, + "epoch": 1.0052911219383143, + "grad_norm": 2.765625, + "learning_rate": 0.000155034976836407, + "loss": 0.11987373352050781, + "mean_token_accuracy": 0.9775131213665008, + "num_tokens": 12039685.0, + "step": 11400 + }, + { + "entropy": 0.07430520750611322, + "epoch": 1.0074957560792785, + "grad_norm": 5.3125, + "learning_rate": 0.0001548361012465666, + "loss": 0.10026315689086913, + "mean_token_accuracy": 0.9790066388249398, + "num_tokens": 12065476.0, + "step": 11425 + }, + { + "entropy": 0.07392804217728553, + "epoch": 1.009700390220243, + "grad_norm": 2.984375, + "learning_rate": 0.00015463691502588778, + "loss": 0.1030975341796875, + "mean_token_accuracy": 0.9764667934179306, + "num_tokens": 12092089.0, + "step": 11450 + }, + { + "entropy": 0.0857640357335913, + "epoch": 1.0119050243612073, + "grad_norm": 1.890625, + "learning_rate": 0.00015443741930270353, + "loss": 0.11760306358337402, + "mean_token_accuracy": 0.9736816781759262, + "num_tokens": 12119248.0, + "step": 11475 + }, + { + "entropy": 0.09214958660071716, + "epoch": 1.0141096585021716, + "grad_norm": 4.0625, + "learning_rate": 0.00015423761520710015, + "loss": 0.12875985145568847, + "mean_token_accuracy": 0.9710692197084427, + "num_tokens": 12146638.0, + "step": 11500 + }, + { + "entropy": 0.09558408751501701, + "epoch": 1.0163142926431359, + "grad_norm": 0.99609375, + "learning_rate": 0.00015403750387091072, + "loss": 0.11520135879516602, + "mean_token_accuracy": 0.9736033695936203, + "num_tokens": 12174242.0, + "step": 11525 + }, + { + "entropy": 0.08030253475852078, + "epoch": 1.0185189267841002, + "grad_norm": 1.7265625, + "learning_rate": 0.0001538370864277087, + "loss": 0.109821138381958, + "mean_token_accuracy": 0.97515142172575, + "num_tokens": 12201776.0, + "step": 11550 + }, + { + "entropy": 0.10750156398629769, + "epoch": 1.0207235609250644, + "grad_norm": 5.6875, + "learning_rate": 0.00015363636401280166, + "loss": 0.16224815368652343, + "mean_token_accuracy": 0.9691583275794983, + "num_tokens": 12229652.0, + "step": 11575 + }, + { + "entropy": 0.08844495263532735, + "epoch": 1.0229281950660287, + "grad_norm": 2.765625, + "learning_rate": 0.00015343533776322467, + "loss": 0.11790482521057129, + "mean_token_accuracy": 0.9742652297019958, + "num_tokens": 12256478.0, + "step": 11600 + }, + { + "entropy": 0.08379628753580619, + "epoch": 1.025132829206993, + "grad_norm": 13.25, + "learning_rate": 0.000153234008817734, + "loss": 0.12678858757019043, + "mean_token_accuracy": 0.9716950944066047, + "num_tokens": 12283290.0, + "step": 11625 + }, + { + "entropy": 0.0752360312268138, + "epoch": 1.0273374633479575, + "grad_norm": 2.140625, + "learning_rate": 0.00015303237831680057, + "loss": 0.10330279350280762, + "mean_token_accuracy": 0.9772691050171852, + "num_tokens": 12309083.0, + "step": 11650 + }, + { + "entropy": 0.10299436590634287, + "epoch": 1.0295420974889218, + "grad_norm": 1.6171875, + "learning_rate": 0.00015283044740260358, + "loss": 0.13117281913757325, + "mean_token_accuracy": 0.9719769659638405, + "num_tokens": 12335892.0, + "step": 11675 + }, + { + "entropy": 0.08254285944451112, + "epoch": 1.031746731629886, + "grad_norm": 3.3125, + "learning_rate": 0.00015262821721902383, + "loss": 0.1227413558959961, + "mean_token_accuracy": 0.9741689735651016, + "num_tokens": 12361808.0, + "step": 11700 + }, + { + "entropy": 0.08000514879269759, + "epoch": 1.0339513657708503, + "grad_norm": 3.5625, + "learning_rate": 0.00015242568891163757, + "loss": 0.10524334907531738, + "mean_token_accuracy": 0.977083123922348, + "num_tokens": 12388433.0, + "step": 11725 + }, + { + "entropy": 0.07193909519643057, + "epoch": 1.0361559999118146, + "grad_norm": 4.40625, + "learning_rate": 0.00015222286362770975, + "loss": 0.09867951393127442, + "mean_token_accuracy": 0.9776046532392502, + "num_tokens": 12415363.0, + "step": 11750 + }, + { + "entropy": 0.06196809031680459, + "epoch": 1.038360634052779, + "grad_norm": 3.4375, + "learning_rate": 0.00015201974251618765, + "loss": 0.09109333992004394, + "mean_token_accuracy": 0.9808389428257942, + "num_tokens": 12441196.0, + "step": 11775 + }, + { + "entropy": 0.07480443540145643, + "epoch": 1.0405652681937432, + "grad_norm": 1.8125, + "learning_rate": 0.00015181632672769428, + "loss": 0.1080815315246582, + "mean_token_accuracy": 0.9766770607233047, + "num_tokens": 12466919.0, + "step": 11800 + }, + { + "entropy": 0.07268392765254247, + "epoch": 1.0427699023347075, + "grad_norm": 3.765625, + "learning_rate": 0.0001516126174145219, + "loss": 0.11195050239562988, + "mean_token_accuracy": 0.9795046743750572, + "num_tokens": 12492043.0, + "step": 11825 + }, + { + "entropy": 0.07267954504699446, + "epoch": 1.044974536475672, + "grad_norm": 3.5, + "learning_rate": 0.0001514086157306256, + "loss": 0.08853129386901855, + "mean_token_accuracy": 0.9804096031188965, + "num_tokens": 12516405.0, + "step": 11850 + }, + { + "entropy": 0.07731569488940294, + "epoch": 1.0471791706166362, + "grad_norm": 3.640625, + "learning_rate": 0.00015120432283161658, + "loss": 0.10150125503540039, + "mean_token_accuracy": 0.9787563198804855, + "num_tokens": 12543253.0, + "step": 11875 + }, + { + "entropy": 0.07172672248416348, + "epoch": 1.0493838047576005, + "grad_norm": 1.7890625, + "learning_rate": 0.0001509997398747557, + "loss": 0.10923091888427734, + "mean_token_accuracy": 0.9780178043246269, + "num_tokens": 12569500.0, + "step": 11900 + }, + { + "entropy": 0.07830634732585168, + "epoch": 1.0515884388985648, + "grad_norm": 4.0625, + "learning_rate": 0.00015079486801894697, + "loss": 0.1168800163269043, + "mean_token_accuracy": 0.9760348123311996, + "num_tokens": 12595946.0, + "step": 11925 + }, + { + "entropy": 0.08794710203001159, + "epoch": 1.053793073039529, + "grad_norm": 2.4375, + "learning_rate": 0.00015058970842473087, + "loss": 0.11576443672180176, + "mean_token_accuracy": 0.9739569133520126, + "num_tokens": 12622062.0, + "step": 11950 + }, + { + "entropy": 0.0924138590262737, + "epoch": 1.0559977071804934, + "grad_norm": 4.03125, + "learning_rate": 0.0001503842622542778, + "loss": 0.12705818176269532, + "mean_token_accuracy": 0.9734925562143326, + "num_tokens": 12648382.0, + "step": 11975 + }, + { + "entropy": 0.08148457858012989, + "epoch": 1.0582023413214576, + "grad_norm": 1.65625, + "learning_rate": 0.0001501785306713817, + "loss": 0.10386928558349609, + "mean_token_accuracy": 0.9757542279362679, + "num_tokens": 12674085.0, + "step": 12000 + }, + { + "epoch": 1.0582023413214576, + "eval_entropy": 0.046589221632191466, + "eval_loss": 0.05319363623857498, + "eval_mean_token_accuracy": 0.9850547621718259, + "eval_num_tokens": 12674085.0, + "eval_runtime": 245.4491, + "eval_samples_per_second": 16.007, + "eval_steps_per_second": 4.005, + "step": 12000 + }, + { + "entropy": 0.07984354873478879, + "epoch": 1.060406975462422, + "grad_norm": 1.5859375, + "learning_rate": 0.00014997251484145306, + "loss": 0.1102367115020752, + "mean_token_accuracy": 0.9725632515549659, + "num_tokens": 12700426.0, + "step": 12025 + }, + { + "entropy": 0.07065878395733308, + "epoch": 1.0626116096033864, + "grad_norm": 2.828125, + "learning_rate": 0.00014976621593151276, + "loss": 0.0880131721496582, + "mean_token_accuracy": 0.9790845555067063, + "num_tokens": 12725972.0, + "step": 12050 + }, + { + "entropy": 0.06189297338773031, + "epoch": 1.0648162437443507, + "grad_norm": 1.4375, + "learning_rate": 0.0001495596351101851, + "loss": 0.08140110969543457, + "mean_token_accuracy": 0.9800572353601456, + "num_tokens": 12751452.0, + "step": 12075 + }, + { + "entropy": 0.08883357329294085, + "epoch": 1.067020877885315, + "grad_norm": 4.21875, + "learning_rate": 0.0001493527735476914, + "loss": 0.14287374496459962, + "mean_token_accuracy": 0.9703152641654015, + "num_tokens": 12779942.0, + "step": 12100 + }, + { + "entropy": 0.11493293149920646, + "epoch": 1.0692255120262792, + "grad_norm": 4.3125, + "learning_rate": 0.00014914563241584324, + "loss": 0.14303622245788575, + "mean_token_accuracy": 0.9675487235188485, + "num_tokens": 12807288.0, + "step": 12125 + }, + { + "entropy": 0.07454075408430072, + "epoch": 1.0714301461672435, + "grad_norm": 2.125, + "learning_rate": 0.000148938212888036, + "loss": 0.0955147647857666, + "mean_token_accuracy": 0.975711221396923, + "num_tokens": 12833484.0, + "step": 12150 + }, + { + "entropy": 0.09217336674570106, + "epoch": 1.0736347803082078, + "grad_norm": 0.283203125, + "learning_rate": 0.00014873051613924194, + "loss": 0.12428475379943847, + "mean_token_accuracy": 0.9730561417341232, + "num_tokens": 12859854.0, + "step": 12175 + }, + { + "entropy": 0.07789780603197868, + "epoch": 1.075839414449172, + "grad_norm": 1.5859375, + "learning_rate": 0.0001485225433460038, + "loss": 0.10363780975341796, + "mean_token_accuracy": 0.975709747672081, + "num_tokens": 12885848.0, + "step": 12200 + }, + { + "entropy": 0.08167748826730531, + "epoch": 1.0780440485901366, + "grad_norm": 4.78125, + "learning_rate": 0.00014831429568642798, + "loss": 0.1125030517578125, + "mean_token_accuracy": 0.9740335485339164, + "num_tokens": 12913035.0, + "step": 12225 + }, + { + "entropy": 0.08957995633652899, + "epoch": 1.0802486827311009, + "grad_norm": 1.78125, + "learning_rate": 0.00014810577434017802, + "loss": 0.13156410217285155, + "mean_token_accuracy": 0.9731645616889, + "num_tokens": 12940009.0, + "step": 12250 + }, + { + "entropy": 0.0830923686773167, + "epoch": 1.0824533168720651, + "grad_norm": 4.53125, + "learning_rate": 0.00014789698048846766, + "loss": 0.10087099075317382, + "mean_token_accuracy": 0.9760751956701279, + "num_tokens": 12966012.0, + "step": 12275 + }, + { + "entropy": 0.061791106628807026, + "epoch": 1.0846579510130294, + "grad_norm": 2.625, + "learning_rate": 0.0001476879153140544, + "loss": 0.09330236434936523, + "mean_token_accuracy": 0.9777177131175995, + "num_tokens": 12991480.0, + "step": 12300 + }, + { + "entropy": 0.07935161782981595, + "epoch": 1.0868625851539937, + "grad_norm": 3.078125, + "learning_rate": 0.00014747858000123268, + "loss": 0.10592585563659668, + "mean_token_accuracy": 0.977505379319191, + "num_tokens": 13017848.0, + "step": 12325 + }, + { + "entropy": 0.08796740350895561, + "epoch": 1.089067219294958, + "grad_norm": 2.125, + "learning_rate": 0.00014726897573582725, + "loss": 0.12587616920471192, + "mean_token_accuracy": 0.9732900467514992, + "num_tokens": 13044023.0, + "step": 12350 + }, + { + "entropy": 0.07419176746087032, + "epoch": 1.0912718534359223, + "grad_norm": 0.671875, + "learning_rate": 0.00014705910370518637, + "loss": 0.09034128189086914, + "mean_token_accuracy": 0.9786489099264145, + "num_tokens": 13071137.0, + "step": 12375 + }, + { + "entropy": 0.0814774570602458, + "epoch": 1.0934764875768865, + "grad_norm": 3.40625, + "learning_rate": 0.00014684896509817503, + "loss": 0.12874791145324707, + "mean_token_accuracy": 0.9732303580641747, + "num_tokens": 13098158.0, + "step": 12400 + }, + { + "entropy": 0.0893553533218801, + "epoch": 1.0956811217178508, + "grad_norm": 2.84375, + "learning_rate": 0.0001466385611051684, + "loss": 0.12181337356567383, + "mean_token_accuracy": 0.9726255944371224, + "num_tokens": 13124123.0, + "step": 12425 + }, + { + "entropy": 0.08096294098184444, + "epoch": 1.0978857558588153, + "grad_norm": 2.765625, + "learning_rate": 0.00014642789291804495, + "loss": 0.10353910446166992, + "mean_token_accuracy": 0.9766229891777038, + "num_tokens": 13151204.0, + "step": 12450 + }, + { + "entropy": 0.06782732794832555, + "epoch": 1.1000903899997796, + "grad_norm": 0.7890625, + "learning_rate": 0.00014621696173017977, + "loss": 0.09019325256347656, + "mean_token_accuracy": 0.9796833488345146, + "num_tokens": 13177694.0, + "step": 12475 + }, + { + "entropy": 0.07130046323902206, + "epoch": 1.1022950241407439, + "grad_norm": 1.796875, + "learning_rate": 0.00014600576873643773, + "loss": 0.09772644996643066, + "mean_token_accuracy": 0.9763439601659775, + "num_tokens": 13205202.0, + "step": 12500 + }, + { + "entropy": 0.06136355609982275, + "epoch": 1.1044996582817082, + "grad_norm": 2.6875, + "learning_rate": 0.00014579431513316672, + "loss": 0.07834446430206299, + "mean_token_accuracy": 0.9810146534442902, + "num_tokens": 13230360.0, + "step": 12525 + }, + { + "entropy": 0.06879065922315931, + "epoch": 1.1067042924226724, + "grad_norm": 0.9921875, + "learning_rate": 0.00014558260211819106, + "loss": 0.11468278884887695, + "mean_token_accuracy": 0.9763934463262558, + "num_tokens": 13256900.0, + "step": 12550 + }, + { + "entropy": 0.07944254042347892, + "epoch": 1.1089089265636367, + "grad_norm": 7.125, + "learning_rate": 0.00014537063089080436, + "loss": 0.10132018089294434, + "mean_token_accuracy": 0.9765703016519547, + "num_tokens": 13283731.0, + "step": 12575 + }, + { + "entropy": 0.07877481998410076, + "epoch": 1.111113560704601, + "grad_norm": 6.0625, + "learning_rate": 0.00014515840265176308, + "loss": 0.10516364097595216, + "mean_token_accuracy": 0.9745470371842384, + "num_tokens": 13309321.0, + "step": 12600 + }, + { + "entropy": 0.0723288451100234, + "epoch": 1.1133181948455655, + "grad_norm": 1.890625, + "learning_rate": 0.00014494591860327952, + "loss": 0.09705366134643555, + "mean_token_accuracy": 0.978722071647644, + "num_tokens": 13336571.0, + "step": 12625 + }, + { + "entropy": 0.07894025734014576, + "epoch": 1.1155228289865298, + "grad_norm": 2.46875, + "learning_rate": 0.00014473317994901508, + "loss": 0.1049672794342041, + "mean_token_accuracy": 0.9736948177218437, + "num_tokens": 13361742.0, + "step": 12650 + }, + { + "entropy": 0.06260268139514664, + "epoch": 1.117727463127494, + "grad_norm": 1.09375, + "learning_rate": 0.0001445201878940734, + "loss": 0.08412216186523437, + "mean_token_accuracy": 0.9800903624296189, + "num_tokens": 13387859.0, + "step": 12675 + }, + { + "entropy": 0.0810048793273745, + "epoch": 1.1199320972684583, + "grad_norm": 4.0625, + "learning_rate": 0.00014430694364499363, + "loss": 0.13930511474609375, + "mean_token_accuracy": 0.9736354485154152, + "num_tokens": 13414615.0, + "step": 12700 + }, + { + "entropy": 0.08986480454841513, + "epoch": 1.1221367314094226, + "grad_norm": 1.984375, + "learning_rate": 0.00014409344840974345, + "loss": 0.12213245391845703, + "mean_token_accuracy": 0.9764112460613251, + "num_tokens": 13441875.0, + "step": 12725 + }, + { + "entropy": 0.06098911547655007, + "epoch": 1.1243413655503869, + "grad_norm": 2.96875, + "learning_rate": 0.00014387970339771236, + "loss": 0.08347887992858886, + "mean_token_accuracy": 0.9798440513014793, + "num_tokens": 13468080.0, + "step": 12750 + }, + { + "entropy": 0.07254652585543227, + "epoch": 1.1265459996913512, + "grad_norm": 1.6171875, + "learning_rate": 0.00014366570981970468, + "loss": 0.1049235725402832, + "mean_token_accuracy": 0.9728858286142349, + "num_tokens": 13493997.0, + "step": 12775 + }, + { + "entropy": 0.08353760754165705, + "epoch": 1.1287506338323154, + "grad_norm": 4.0, + "learning_rate": 0.000143451468887933, + "loss": 0.1097674560546875, + "mean_token_accuracy": 0.972501235306263, + "num_tokens": 13519838.0, + "step": 12800 + }, + { + "entropy": 0.08435321261407808, + "epoch": 1.1309552679732797, + "grad_norm": 4.3125, + "learning_rate": 0.00014323698181601085, + "loss": 0.13715192794799805, + "mean_token_accuracy": 0.9768632537126541, + "num_tokens": 13546798.0, + "step": 12825 + }, + { + "entropy": 0.10204421273374464, + "epoch": 1.1331599021142442, + "grad_norm": 2.984375, + "learning_rate": 0.00014302224981894616, + "loss": 0.12858396530151367, + "mean_token_accuracy": 0.9731094029545784, + "num_tokens": 13573745.0, + "step": 12850 + }, + { + "entropy": 0.06739531061204616, + "epoch": 1.1353645362552085, + "grad_norm": 2.46875, + "learning_rate": 0.0001428072741131344, + "loss": 0.096489896774292, + "mean_token_accuracy": 0.9783440440893173, + "num_tokens": 13599946.0, + "step": 12875 + }, + { + "entropy": 0.09769865162845236, + "epoch": 1.1375691703961728, + "grad_norm": 2.109375, + "learning_rate": 0.00014259205591635144, + "loss": 0.1369704532623291, + "mean_token_accuracy": 0.9713424414396286, + "num_tokens": 13627369.0, + "step": 12900 + }, + { + "entropy": 0.07299916661781027, + "epoch": 1.139773804537137, + "grad_norm": 4.75, + "learning_rate": 0.00014237659644774684, + "loss": 0.08844392776489257, + "mean_token_accuracy": 0.9792978599667549, + "num_tokens": 13653006.0, + "step": 12925 + }, + { + "entropy": 0.068821275804803, + "epoch": 1.1419784386781013, + "grad_norm": 2.296875, + "learning_rate": 0.00014216089692783694, + "loss": 0.08574678421020508, + "mean_token_accuracy": 0.9801573866605758, + "num_tokens": 13679808.0, + "step": 12950 + }, + { + "entropy": 0.057801376127754336, + "epoch": 1.1441830728190656, + "grad_norm": 4.15625, + "learning_rate": 0.00014194495857849782, + "loss": 0.09204750061035157, + "mean_token_accuracy": 0.9782607558369637, + "num_tokens": 13705484.0, + "step": 12975 + }, + { + "entropy": 0.06144713588932063, + "epoch": 1.14638770696003, + "grad_norm": 2.5, + "learning_rate": 0.00014172878262295853, + "loss": 0.0980979061126709, + "mean_token_accuracy": 0.978396021425724, + "num_tokens": 13731153.0, + "step": 13000 + }, + { + "epoch": 1.14638770696003, + "eval_entropy": 0.040426121556151766, + "eval_loss": 0.04483678936958313, + "eval_mean_token_accuracy": 0.987208288053045, + "eval_num_tokens": 13731153.0, + "eval_runtime": 248.9407, + "eval_samples_per_second": 15.783, + "eval_steps_per_second": 3.949, + "step": 13000 + }, + { + "entropy": 0.07772521212056745, + "epoch": 1.1485923411009944, + "grad_norm": 2.5, + "learning_rate": 0.00014151237028579402, + "loss": 0.09974138259887695, + "mean_token_accuracy": 0.9792518520355225, + "num_tokens": 13756932.0, + "step": 13025 + }, + { + "entropy": 0.08041290965047665, + "epoch": 1.1507969752419587, + "grad_norm": 2.796875, + "learning_rate": 0.00014129572279291837, + "loss": 0.12203351974487305, + "mean_token_accuracy": 0.9737589022517205, + "num_tokens": 13783460.0, + "step": 13050 + }, + { + "entropy": 0.08667991714755771, + "epoch": 1.153001609382923, + "grad_norm": 3.5625, + "learning_rate": 0.00014107884137157768, + "loss": 0.12619853019714355, + "mean_token_accuracy": 0.969507671892643, + "num_tokens": 13810053.0, + "step": 13075 + }, + { + "entropy": 0.07824131268818746, + "epoch": 1.1552062435238872, + "grad_norm": 1.28125, + "learning_rate": 0.00014086172725034316, + "loss": 0.08437572479248047, + "mean_token_accuracy": 0.9793237087130546, + "num_tokens": 13835469.0, + "step": 13100 + }, + { + "entropy": 0.07533937879052247, + "epoch": 1.1574108776648515, + "grad_norm": 4.6875, + "learning_rate": 0.00014064438165910432, + "loss": 0.1265271282196045, + "mean_token_accuracy": 0.9731573021411896, + "num_tokens": 13862336.0, + "step": 13125 + }, + { + "entropy": 0.07942030368372797, + "epoch": 1.1596155118058158, + "grad_norm": 3.1875, + "learning_rate": 0.00014042680582906176, + "loss": 0.1086429500579834, + "mean_token_accuracy": 0.9767365399003028, + "num_tokens": 13888059.0, + "step": 13150 + }, + { + "entropy": 0.064769710698165, + "epoch": 1.16182014594678, + "grad_norm": 0.62109375, + "learning_rate": 0.00014020900099272036, + "loss": 0.08750583648681641, + "mean_token_accuracy": 0.9795074462890625, + "num_tokens": 13912788.0, + "step": 13175 + }, + { + "entropy": 0.07411962449958082, + "epoch": 1.1640247800877443, + "grad_norm": 3.421875, + "learning_rate": 0.00013999096838388227, + "loss": 0.09150676727294922, + "mean_token_accuracy": 0.9767512452602386, + "num_tokens": 13939505.0, + "step": 13200 + }, + { + "entropy": 0.07226109138398897, + "epoch": 1.1662294142287086, + "grad_norm": 2.46875, + "learning_rate": 0.0001397727092376399, + "loss": 0.10258906364440917, + "mean_token_accuracy": 0.9773488900065422, + "num_tokens": 13967074.0, + "step": 13225 + }, + { + "entropy": 0.06648587315896293, + "epoch": 1.1684340483696731, + "grad_norm": 6.71875, + "learning_rate": 0.0001395542247903689, + "loss": 0.0932936954498291, + "mean_token_accuracy": 0.980256330370903, + "num_tokens": 13993453.0, + "step": 13250 + }, + { + "entropy": 0.07006794507964514, + "epoch": 1.1706386825106374, + "grad_norm": 2.375, + "learning_rate": 0.00013933551627972124, + "loss": 0.08853204727172852, + "mean_token_accuracy": 0.9816073548793792, + "num_tokens": 14019777.0, + "step": 13275 + }, + { + "entropy": 0.06473562117433175, + "epoch": 1.1728433166516017, + "grad_norm": 5.4375, + "learning_rate": 0.00013911658494461808, + "loss": 0.08555278778076172, + "mean_token_accuracy": 0.9794245132803917, + "num_tokens": 14046099.0, + "step": 13300 + }, + { + "entropy": 0.057029941927758045, + "epoch": 1.175047950792566, + "grad_norm": 2.15625, + "learning_rate": 0.0001388974320252429, + "loss": 0.08000053405761719, + "mean_token_accuracy": 0.9806934934854508, + "num_tokens": 14071432.0, + "step": 13325 + }, + { + "entropy": 0.059837884416338055, + "epoch": 1.1772525849335302, + "grad_norm": 2.5625, + "learning_rate": 0.00013867805876303425, + "loss": 0.09512213706970214, + "mean_token_accuracy": 0.9803010419011116, + "num_tokens": 14097921.0, + "step": 13350 + }, + { + "entropy": 0.08421937897976023, + "epoch": 1.1794572190744945, + "grad_norm": 2.015625, + "learning_rate": 0.00013845846640067906, + "loss": 0.13767002105712892, + "mean_token_accuracy": 0.9754521402716637, + "num_tokens": 14123607.0, + "step": 13375 + }, + { + "entropy": 0.07827224752778421, + "epoch": 1.1816618532154588, + "grad_norm": 2.453125, + "learning_rate": 0.00013823865618210523, + "loss": 0.09929595947265625, + "mean_token_accuracy": 0.9765499302744866, + "num_tokens": 14150885.0, + "step": 13400 + }, + { + "entropy": 0.07516255997674307, + "epoch": 1.1838664873564233, + "grad_norm": 4.8125, + "learning_rate": 0.00013801862935247484, + "loss": 0.09415432929992676, + "mean_token_accuracy": 0.9775673219561577, + "num_tokens": 14177368.0, + "step": 13425 + }, + { + "entropy": 0.08141786261985544, + "epoch": 1.1860711214973876, + "grad_norm": 2.296875, + "learning_rate": 0.00013779838715817695, + "loss": 0.11828108787536622, + "mean_token_accuracy": 0.9732054328918457, + "num_tokens": 14203895.0, + "step": 13450 + }, + { + "entropy": 0.06786765140190255, + "epoch": 1.1882757556383519, + "grad_norm": 2.0, + "learning_rate": 0.00013757793084682066, + "loss": 0.0916921329498291, + "mean_token_accuracy": 0.979742522239685, + "num_tokens": 14229737.0, + "step": 13475 + }, + { + "entropy": 0.06600234019482741, + "epoch": 1.1904803897793161, + "grad_norm": 2.5625, + "learning_rate": 0.00013735726166722799, + "loss": 0.08265236854553222, + "mean_token_accuracy": 0.9800075829029083, + "num_tokens": 14255643.0, + "step": 13500 + }, + { + "entropy": 0.06735640317740035, + "epoch": 1.1926850239202804, + "grad_norm": 4.75, + "learning_rate": 0.00013713638086942668, + "loss": 0.08291936874389648, + "mean_token_accuracy": 0.9814954048395157, + "num_tokens": 14279948.0, + "step": 13525 + }, + { + "entropy": 0.06462765499542002, + "epoch": 1.1948896580612447, + "grad_norm": 5.625, + "learning_rate": 0.00013691528970464334, + "loss": 0.09613308906555176, + "mean_token_accuracy": 0.9818469232320786, + "num_tokens": 14306310.0, + "step": 13550 + }, + { + "entropy": 0.06327619094226974, + "epoch": 1.197094292202209, + "grad_norm": 5.40625, + "learning_rate": 0.00013669398942529627, + "loss": 0.09374700546264648, + "mean_token_accuracy": 0.9792777088284492, + "num_tokens": 14331413.0, + "step": 13575 + }, + { + "entropy": 0.07611146297247615, + "epoch": 1.1992989263431733, + "grad_norm": 1.4375, + "learning_rate": 0.0001364724812849882, + "loss": 0.10114512443542481, + "mean_token_accuracy": 0.97606270134449, + "num_tokens": 14358067.0, + "step": 13600 + }, + { + "entropy": 0.06686223473065184, + "epoch": 1.2015035604841375, + "grad_norm": 3.3125, + "learning_rate": 0.00013625076653849956, + "loss": 0.10389527320861816, + "mean_token_accuracy": 0.9748416700959206, + "num_tokens": 14383570.0, + "step": 13625 + }, + { + "entropy": 0.09236453044810332, + "epoch": 1.203708194625102, + "grad_norm": 2.375, + "learning_rate": 0.00013602884644178088, + "loss": 0.11577310562133789, + "mean_token_accuracy": 0.9753095716238022, + "num_tokens": 14410596.0, + "step": 13650 + }, + { + "entropy": 0.06537643666742951, + "epoch": 1.2059128287660663, + "grad_norm": 2.265625, + "learning_rate": 0.00013580672225194614, + "loss": 0.07989202499389648, + "mean_token_accuracy": 0.9799061450362205, + "num_tokens": 14435854.0, + "step": 13675 + }, + { + "entropy": 0.055518159719649705, + "epoch": 1.2081174629070306, + "grad_norm": 0.765625, + "learning_rate": 0.00013558439522726534, + "loss": 0.07418478488922119, + "mean_token_accuracy": 0.9806128066778182, + "num_tokens": 14460482.0, + "step": 13700 + }, + { + "entropy": 0.08347531014704146, + "epoch": 1.2103220970479949, + "grad_norm": 1.140625, + "learning_rate": 0.00013536186662715756, + "loss": 0.10274332046508788, + "mean_token_accuracy": 0.9763949981331825, + "num_tokens": 14486796.0, + "step": 13725 + }, + { + "entropy": 0.07670228075119667, + "epoch": 1.2125267311889592, + "grad_norm": 1.828125, + "learning_rate": 0.0001351391377121837, + "loss": 0.1251106834411621, + "mean_token_accuracy": 0.9748952552676201, + "num_tokens": 14515063.0, + "step": 13750 + }, + { + "entropy": 0.07520388591961819, + "epoch": 1.2147313653299234, + "grad_norm": 3.296875, + "learning_rate": 0.0001349162097440394, + "loss": 0.13189183235168456, + "mean_token_accuracy": 0.9732701346278191, + "num_tokens": 14540360.0, + "step": 13775 + }, + { + "entropy": 0.06956776957900729, + "epoch": 1.2169359994708877, + "grad_norm": 7.84375, + "learning_rate": 0.00013469308398554778, + "loss": 0.09446800231933594, + "mean_token_accuracy": 0.9780878749489784, + "num_tokens": 14566124.0, + "step": 13800 + }, + { + "entropy": 0.07163769841310569, + "epoch": 1.2191406336118522, + "grad_norm": 3.53125, + "learning_rate": 0.00013446976170065263, + "loss": 0.09359949111938476, + "mean_token_accuracy": 0.9788430881500244, + "num_tokens": 14591871.0, + "step": 13825 + }, + { + "entropy": 0.07607505476626102, + "epoch": 1.2213452677528165, + "grad_norm": 1.328125, + "learning_rate": 0.00013424624415441077, + "loss": 0.10732210159301758, + "mean_token_accuracy": 0.9768683615326882, + "num_tokens": 14618746.0, + "step": 13850 + }, + { + "entropy": 0.07799635212053545, + "epoch": 1.2235499018937808, + "grad_norm": 2.296875, + "learning_rate": 0.00013402253261298524, + "loss": 0.10910382270812988, + "mean_token_accuracy": 0.9741278582811356, + "num_tokens": 14645850.0, + "step": 13875 + }, + { + "entropy": 0.0740620466673863, + "epoch": 1.225754536034745, + "grad_norm": 1.71875, + "learning_rate": 0.00013379862834363797, + "loss": 0.10109277725219727, + "mean_token_accuracy": 0.9789579641819001, + "num_tokens": 14672531.0, + "step": 13900 + }, + { + "entropy": 0.07713249627413461, + "epoch": 1.2279591701757093, + "grad_norm": 2.75, + "learning_rate": 0.00013357453261472258, + "loss": 0.10372325897216797, + "mean_token_accuracy": 0.9723598077893257, + "num_tokens": 14698635.0, + "step": 13925 + }, + { + "entropy": 0.07735523965937319, + "epoch": 1.2301638043166736, + "grad_norm": 2.09375, + "learning_rate": 0.0001333502466956774, + "loss": 0.10036856651306153, + "mean_token_accuracy": 0.9775028115510941, + "num_tokens": 14723411.0, + "step": 13950 + }, + { + "entropy": 0.07590767514484469, + "epoch": 1.2323684384576379, + "grad_norm": 2.046875, + "learning_rate": 0.00013312577185701802, + "loss": 0.09764260292053223, + "mean_token_accuracy": 0.9767592492699623, + "num_tokens": 14750384.0, + "step": 13975 + }, + { + "entropy": 0.07375224770570639, + "epoch": 1.2345730725986024, + "grad_norm": 1.3515625, + "learning_rate": 0.00013290110937033022, + "loss": 0.10770444869995117, + "mean_token_accuracy": 0.9764727628231049, + "num_tokens": 14778089.0, + "step": 14000 + }, + { + "epoch": 1.2345730725986024, + "eval_entropy": 0.03317551793803443, + "eval_loss": 0.042815063148736954, + "eval_mean_token_accuracy": 0.9874204469964075, + "eval_num_tokens": 14778089.0, + "eval_runtime": 249.1251, + "eval_samples_per_second": 15.771, + "eval_steps_per_second": 3.946, + "step": 14000 + }, + { + "entropy": 0.08082759477692888, + "epoch": 1.2367777067395667, + "grad_norm": 3.21875, + "learning_rate": 0.00013267626050826275, + "loss": 0.1340487289428711, + "mean_token_accuracy": 0.9732246950268746, + "num_tokens": 14806606.0, + "step": 14025 + }, + { + "entropy": 0.06270176315927528, + "epoch": 1.238982340880531, + "grad_norm": 1.8515625, + "learning_rate": 0.0001324512265445202, + "loss": 0.0792490816116333, + "mean_token_accuracy": 0.9802202478051185, + "num_tokens": 14832296.0, + "step": 14050 + }, + { + "entropy": 0.06344090539118043, + "epoch": 1.2411869750214952, + "grad_norm": 1.6484375, + "learning_rate": 0.0001322260087538556, + "loss": 0.07540414333343506, + "mean_token_accuracy": 0.9804856219887733, + "num_tokens": 14857948.0, + "step": 14075 + }, + { + "entropy": 0.07318885205080733, + "epoch": 1.2433916091624595, + "grad_norm": 3.0625, + "learning_rate": 0.00013200060841206336, + "loss": 0.10172588348388673, + "mean_token_accuracy": 0.9763992458581925, + "num_tokens": 14885010.0, + "step": 14100 + }, + { + "entropy": 0.09100308601686265, + "epoch": 1.2455962433034238, + "grad_norm": 0.64453125, + "learning_rate": 0.00013177502679597198, + "loss": 0.11631698608398437, + "mean_token_accuracy": 0.9720056369900704, + "num_tokens": 14911651.0, + "step": 14125 + }, + { + "entropy": 0.06297349792752356, + "epoch": 1.247800877444388, + "grad_norm": 4.34375, + "learning_rate": 0.00013154926518343685, + "loss": 0.08014249801635742, + "mean_token_accuracy": 0.9779363590478897, + "num_tokens": 14937307.0, + "step": 14150 + }, + { + "entropy": 0.06706018248645705, + "epoch": 1.2500055115853523, + "grad_norm": 3.5, + "learning_rate": 0.00013132332485333293, + "loss": 0.09858132362365722, + "mean_token_accuracy": 0.9780547374486923, + "num_tokens": 14963286.0, + "step": 14175 + }, + { + "entropy": 0.06707234902569326, + "epoch": 1.2522101457263166, + "grad_norm": 3.8125, + "learning_rate": 0.0001310972070855477, + "loss": 0.09439393043518067, + "mean_token_accuracy": 0.9792554202675819, + "num_tokens": 14988574.0, + "step": 14200 + }, + { + "entropy": 0.055138559531333155, + "epoch": 1.2544147798672811, + "grad_norm": 2.0625, + "learning_rate": 0.00013087091316097357, + "loss": 0.07935715675354003, + "mean_token_accuracy": 0.9827534911036492, + "num_tokens": 15014049.0, + "step": 14225 + }, + { + "entropy": 0.05755839720135555, + "epoch": 1.2566194140082454, + "grad_norm": 0.70703125, + "learning_rate": 0.000130644444361501, + "loss": 0.08224278450012207, + "mean_token_accuracy": 0.9831167414784432, + "num_tokens": 15040415.0, + "step": 14250 + }, + { + "entropy": 0.07995560545474291, + "epoch": 1.2588240481492097, + "grad_norm": 1.3671875, + "learning_rate": 0.00013041780197001096, + "loss": 0.10370305061340332, + "mean_token_accuracy": 0.9753643274307251, + "num_tokens": 15068204.0, + "step": 14275 + }, + { + "entropy": 0.05787755233614007, + "epoch": 1.261028682290174, + "grad_norm": 1.328125, + "learning_rate": 0.00013019098727036783, + "loss": 0.07909601211547851, + "mean_token_accuracy": 0.9826915863156319, + "num_tokens": 15094396.0, + "step": 14300 + }, + { + "entropy": 0.07108791207894682, + "epoch": 1.2632333164311382, + "grad_norm": 3.328125, + "learning_rate": 0.00012996400154741206, + "loss": 0.11207365036010743, + "mean_token_accuracy": 0.9777934941649437, + "num_tokens": 15120265.0, + "step": 14325 + }, + { + "entropy": 0.07744198189320742, + "epoch": 1.2654379505721025, + "grad_norm": 3.78125, + "learning_rate": 0.0001297368460869529, + "loss": 0.0994594669342041, + "mean_token_accuracy": 0.9777314651012421, + "num_tokens": 15147541.0, + "step": 14350 + }, + { + "entropy": 0.0713876857640571, + "epoch": 1.2676425847130668, + "grad_norm": 1.796875, + "learning_rate": 0.000129509522175761, + "loss": 0.1327120590209961, + "mean_token_accuracy": 0.9768693792819977, + "num_tokens": 15174152.0, + "step": 14375 + }, + { + "entropy": 0.07512093096884201, + "epoch": 1.2698472188540313, + "grad_norm": 3.40625, + "learning_rate": 0.00012928203110156147, + "loss": 0.0983432388305664, + "mean_token_accuracy": 0.9782489436864853, + "num_tokens": 15199844.0, + "step": 14400 + }, + { + "entropy": 0.07445235479419353, + "epoch": 1.2720518529949953, + "grad_norm": 0.53515625, + "learning_rate": 0.00012905437415302614, + "loss": 0.10376456260681152, + "mean_token_accuracy": 0.9784587541222572, + "num_tokens": 15226788.0, + "step": 14425 + }, + { + "entropy": 0.07068355955168953, + "epoch": 1.2742564871359598, + "grad_norm": 3.203125, + "learning_rate": 0.00012882655261976656, + "loss": 0.09820618629455566, + "mean_token_accuracy": 0.9778537949919701, + "num_tokens": 15252536.0, + "step": 14450 + }, + { + "entropy": 0.06282317588367732, + "epoch": 1.2764611212769241, + "grad_norm": 3.203125, + "learning_rate": 0.00012859856779232656, + "loss": 0.09604778289794921, + "mean_token_accuracy": 0.9794092005491257, + "num_tokens": 15278867.0, + "step": 14475 + }, + { + "entropy": 0.06273727558698738, + "epoch": 1.2786657554178884, + "grad_norm": 0.7421875, + "learning_rate": 0.00012837042096217513, + "loss": 0.08490729331970215, + "mean_token_accuracy": 0.9807071474194526, + "num_tokens": 15305157.0, + "step": 14500 + }, + { + "entropy": 0.07701689342531609, + "epoch": 1.2808703895588527, + "grad_norm": 1.6015625, + "learning_rate": 0.00012814211342169872, + "loss": 0.10085676193237304, + "mean_token_accuracy": 0.9763132336735726, + "num_tokens": 15332404.0, + "step": 14525 + }, + { + "entropy": 0.08465464082692051, + "epoch": 1.283075023699817, + "grad_norm": 2.328125, + "learning_rate": 0.00012791364646419436, + "loss": 0.116805419921875, + "mean_token_accuracy": 0.974503707587719, + "num_tokens": 15361644.0, + "step": 14550 + }, + { + "entropy": 0.08087968374718912, + "epoch": 1.2852796578407812, + "grad_norm": 1.4140625, + "learning_rate": 0.00012768502138386196, + "loss": 0.11668486595153808, + "mean_token_accuracy": 0.9748384010791779, + "num_tokens": 15388581.0, + "step": 14575 + }, + { + "entropy": 0.06501860655174824, + "epoch": 1.2874842919817455, + "grad_norm": 3.765625, + "learning_rate": 0.00012745623947579737, + "loss": 0.08816573143005371, + "mean_token_accuracy": 0.9800462105870247, + "num_tokens": 15414389.0, + "step": 14600 + }, + { + "entropy": 0.05114416478230851, + "epoch": 1.28968892612271, + "grad_norm": 3.09375, + "learning_rate": 0.0001272273020359846, + "loss": 0.07702481269836425, + "mean_token_accuracy": 0.9823195374011994, + "num_tokens": 15439393.0, + "step": 14625 + }, + { + "entropy": 0.06628791322320467, + "epoch": 1.2918935602636743, + "grad_norm": 1.625, + "learning_rate": 0.0001269982103612889, + "loss": 0.08344200134277344, + "mean_token_accuracy": 0.9795394757390022, + "num_tokens": 15464910.0, + "step": 14650 + }, + { + "entropy": 0.06431539795725257, + "epoch": 1.2940981944046386, + "grad_norm": 2.421875, + "learning_rate": 0.00012676896574944903, + "loss": 0.07873389720916749, + "mean_token_accuracy": 0.9806241154670715, + "num_tokens": 15490634.0, + "step": 14675 + }, + { + "entropy": 0.07032535757040023, + "epoch": 1.2963028285456029, + "grad_norm": 1.734375, + "learning_rate": 0.00012653956949907027, + "loss": 0.1325330352783203, + "mean_token_accuracy": 0.9765818390250206, + "num_tokens": 15517211.0, + "step": 14700 + }, + { + "entropy": 0.07646813986793859, + "epoch": 1.2985074626865671, + "grad_norm": 0.421875, + "learning_rate": 0.0001263100229096167, + "loss": 0.10172966003417969, + "mean_token_accuracy": 0.9765880072116851, + "num_tokens": 15543427.0, + "step": 14725 + }, + { + "entropy": 0.07018844754958992, + "epoch": 1.3007120968275314, + "grad_norm": 6.0625, + "learning_rate": 0.00012608032728140422, + "loss": 0.09067879676818848, + "mean_token_accuracy": 0.9805409485101699, + "num_tokens": 15569803.0, + "step": 14750 + }, + { + "entropy": 0.07123511103447527, + "epoch": 1.3029167309684957, + "grad_norm": 3.59375, + "learning_rate": 0.0001258504839155929, + "loss": 0.09630011558532715, + "mean_token_accuracy": 0.9766310065984726, + "num_tokens": 15597403.0, + "step": 14775 + }, + { + "entropy": 0.07258567026554374, + "epoch": 1.3051213651094602, + "grad_norm": 2.625, + "learning_rate": 0.00012562049411417965, + "loss": 0.10658187866210937, + "mean_token_accuracy": 0.9771447145938873, + "num_tokens": 15624262.0, + "step": 14800 + }, + { + "entropy": 0.07019207466975785, + "epoch": 1.3073259992504245, + "grad_norm": 3.671875, + "learning_rate": 0.00012539035917999097, + "loss": 0.09414088249206543, + "mean_token_accuracy": 0.976831995844841, + "num_tokens": 15651296.0, + "step": 14825 + }, + { + "entropy": 0.057612950848124456, + "epoch": 1.3095306333913888, + "grad_norm": 2.421875, + "learning_rate": 0.0001251600804166755, + "loss": 0.08104521751403809, + "mean_token_accuracy": 0.9797638493776322, + "num_tokens": 15676634.0, + "step": 14850 + }, + { + "entropy": 0.08526865348889259, + "epoch": 1.311735267532353, + "grad_norm": 2.546875, + "learning_rate": 0.00012492965912869658, + "loss": 0.10663084983825684, + "mean_token_accuracy": 0.9760200345516205, + "num_tokens": 15704426.0, + "step": 14875 + }, + { + "entropy": 0.07147054383560317, + "epoch": 1.3139399016733173, + "grad_norm": 2.109375, + "learning_rate": 0.00012469909662132496, + "loss": 0.09058443069458008, + "mean_token_accuracy": 0.9777011832594872, + "num_tokens": 15731370.0, + "step": 14900 + }, + { + "entropy": 0.05473189148120582, + "epoch": 1.3161445358142816, + "grad_norm": 0.6953125, + "learning_rate": 0.00012446839420063137, + "loss": 0.07786131858825683, + "mean_token_accuracy": 0.9817092132568359, + "num_tokens": 15757600.0, + "step": 14925 + }, + { + "entropy": 0.07791225100867451, + "epoch": 1.3183491699552459, + "grad_norm": 2.421875, + "learning_rate": 0.000124237553173479, + "loss": 0.14551064491271973, + "mean_token_accuracy": 0.9755642533302307, + "num_tokens": 15785308.0, + "step": 14950 + }, + { + "entropy": 0.07555221020767931, + "epoch": 1.3205538040962104, + "grad_norm": 0.84375, + "learning_rate": 0.00012400657484751634, + "loss": 0.08915863037109376, + "mean_token_accuracy": 0.9793051525950431, + "num_tokens": 15811726.0, + "step": 14975 + }, + { + "entropy": 0.06056748362811049, + "epoch": 1.3227584382371744, + "grad_norm": 1.5859375, + "learning_rate": 0.00012377546053116958, + "loss": 0.09584538459777832, + "mean_token_accuracy": 0.9806088766455651, + "num_tokens": 15837688.0, + "step": 15000 + }, + { + "epoch": 1.3227584382371744, + "eval_entropy": 0.03326694843892694, + "eval_loss": 0.04031017795205116, + "eval_mean_token_accuracy": 0.9879487514010766, + "eval_num_tokens": 15837688.0, + "eval_runtime": 248.7592, + "eval_samples_per_second": 15.794, + "eval_steps_per_second": 3.952, + "step": 15000 + }, + { + "entropy": 0.06415669579728274, + "epoch": 1.324963072378139, + "grad_norm": 2.859375, + "learning_rate": 0.0001235442115336352, + "loss": 0.08943920135498047, + "mean_token_accuracy": 0.9794209009408951, + "num_tokens": 15863738.0, + "step": 15025 + }, + { + "entropy": 0.08142153425927973, + "epoch": 1.3271677065191032, + "grad_norm": 2.140625, + "learning_rate": 0.0001233128291648727, + "loss": 0.11874267578125, + "mean_token_accuracy": 0.9757076546549797, + "num_tokens": 15890894.0, + "step": 15050 + }, + { + "entropy": 0.0783909736975329, + "epoch": 1.3293723406600675, + "grad_norm": 2.0625, + "learning_rate": 0.0001230813147355971, + "loss": 0.11000186920166016, + "mean_token_accuracy": 0.9764264470338821, + "num_tokens": 15918938.0, + "step": 15075 + }, + { + "entropy": 0.06577202208303788, + "epoch": 1.3315769748010318, + "grad_norm": 0.9453125, + "learning_rate": 0.0001228496695572714, + "loss": 0.10996245384216309, + "mean_token_accuracy": 0.9817772251367569, + "num_tokens": 15946088.0, + "step": 15100 + }, + { + "entropy": 0.06237481433723588, + "epoch": 1.333781608941996, + "grad_norm": 0.8125, + "learning_rate": 0.00012261789494209937, + "loss": 0.08556965827941894, + "mean_token_accuracy": 0.977502366900444, + "num_tokens": 15971944.0, + "step": 15125 + }, + { + "entropy": 0.06582288480800344, + "epoch": 1.3359862430829603, + "grad_norm": 2.5625, + "learning_rate": 0.00012238599220301788, + "loss": 0.09093325614929199, + "mean_token_accuracy": 0.9768818697333336, + "num_tokens": 15997233.0, + "step": 15150 + }, + { + "entropy": 0.065389038942958, + "epoch": 1.3381908772239246, + "grad_norm": 0.80078125, + "learning_rate": 0.00012215396265368973, + "loss": 0.09562211036682129, + "mean_token_accuracy": 0.9802220293879509, + "num_tokens": 16024740.0, + "step": 15175 + }, + { + "entropy": 0.08525684173408081, + "epoch": 1.340395511364889, + "grad_norm": 2.921875, + "learning_rate": 0.00012192180760849595, + "loss": 0.1441513729095459, + "mean_token_accuracy": 0.970992026925087, + "num_tokens": 16051698.0, + "step": 15200 + }, + { + "entropy": 0.06295717555127339, + "epoch": 1.3426001455058534, + "grad_norm": 4.5625, + "learning_rate": 0.00012168952838252853, + "loss": 0.0884068489074707, + "mean_token_accuracy": 0.9819012552499771, + "num_tokens": 16077703.0, + "step": 15225 + }, + { + "entropy": 0.060524827540357366, + "epoch": 1.3448047796468177, + "grad_norm": 2.5, + "learning_rate": 0.00012145712629158286, + "loss": 0.08740668296813965, + "mean_token_accuracy": 0.9805832356214523, + "num_tokens": 16104630.0, + "step": 15250 + }, + { + "entropy": 0.07011599280362134, + "epoch": 1.347009413787782, + "grad_norm": 2.203125, + "learning_rate": 0.00012122460265215038, + "loss": 0.10101802825927735, + "mean_token_accuracy": 0.9771647998690605, + "num_tokens": 16130521.0, + "step": 15275 + }, + { + "entropy": 0.05694206524029141, + "epoch": 1.3492140479287462, + "grad_norm": 2.640625, + "learning_rate": 0.000120991958781411, + "loss": 0.07328418731689453, + "mean_token_accuracy": 0.9819918230175972, + "num_tokens": 16155447.0, + "step": 15300 + }, + { + "entropy": 0.05970909093797672, + "epoch": 1.3514186820697105, + "grad_norm": 2.96875, + "learning_rate": 0.00012075919599722583, + "loss": 0.08711193084716796, + "mean_token_accuracy": 0.9814882269501686, + "num_tokens": 16181215.0, + "step": 15325 + }, + { + "entropy": 0.08422257184254704, + "epoch": 1.3536233162106748, + "grad_norm": 0.48828125, + "learning_rate": 0.00012052631561812941, + "loss": 0.1124593448638916, + "mean_token_accuracy": 0.9755303618311882, + "num_tokens": 16208701.0, + "step": 15350 + }, + { + "entropy": 0.06997806068888167, + "epoch": 1.3558279503516393, + "grad_norm": 0.7578125, + "learning_rate": 0.00012029331896332259, + "loss": 0.08874547958374024, + "mean_token_accuracy": 0.9804951578378678, + "num_tokens": 16235817.0, + "step": 15375 + }, + { + "entropy": 0.06830648218063289, + "epoch": 1.3580325844926033, + "grad_norm": 0.84765625, + "learning_rate": 0.00012006020735266474, + "loss": 0.1033985424041748, + "mean_token_accuracy": 0.9769791024923324, + "num_tokens": 16262506.0, + "step": 15400 + }, + { + "entropy": 0.05717379552341299, + "epoch": 1.3602372186335678, + "grad_norm": 3.53125, + "learning_rate": 0.00011982698210666657, + "loss": 0.08480052947998047, + "mean_token_accuracy": 0.9808929657936096, + "num_tokens": 16288669.0, + "step": 15425 + }, + { + "entropy": 0.07715147120325128, + "epoch": 1.3624418527745321, + "grad_norm": 2.171875, + "learning_rate": 0.00011959364454648238, + "loss": 0.11012930870056152, + "mean_token_accuracy": 0.9763377743959427, + "num_tokens": 16314669.0, + "step": 15450 + }, + { + "entropy": 0.06519403403050092, + "epoch": 1.3646464869154964, + "grad_norm": 2.21875, + "learning_rate": 0.0001193601959939028, + "loss": 0.07466179847717286, + "mean_token_accuracy": 0.9815621060132981, + "num_tokens": 16340364.0, + "step": 15475 + }, + { + "entropy": 0.06409725191537291, + "epoch": 1.3668511210564607, + "grad_norm": 2.46875, + "learning_rate": 0.00011912663777134707, + "loss": 0.10391844749450684, + "mean_token_accuracy": 0.9776857647299767, + "num_tokens": 16367170.0, + "step": 15500 + }, + { + "entropy": 0.06850099113042234, + "epoch": 1.369055755197425, + "grad_norm": 1.578125, + "learning_rate": 0.00011889297120185585, + "loss": 0.10006083488464355, + "mean_token_accuracy": 0.9765678381919861, + "num_tokens": 16393766.0, + "step": 15525 + }, + { + "entropy": 0.06729476221749792, + "epoch": 1.3712603893383892, + "grad_norm": 0.75390625, + "learning_rate": 0.0001186591976090834, + "loss": 0.0871645450592041, + "mean_token_accuracy": 0.9782984137535096, + "num_tokens": 16420859.0, + "step": 15550 + }, + { + "entropy": 0.06440466578496853, + "epoch": 1.3734650234793535, + "grad_norm": 2.734375, + "learning_rate": 0.00011842531831729031, + "loss": 0.07289909362792969, + "mean_token_accuracy": 0.9761216223239899, + "num_tokens": 16446598.0, + "step": 15575 + }, + { + "entropy": 0.052396752069835205, + "epoch": 1.375669657620318, + "grad_norm": 1.578125, + "learning_rate": 0.00011819133465133592, + "loss": 0.07493824005126953, + "mean_token_accuracy": 0.9828401359915734, + "num_tokens": 16473781.0, + "step": 15600 + }, + { + "entropy": 0.05889642575762991, + "epoch": 1.3778742917612823, + "grad_norm": 2.796875, + "learning_rate": 0.0001179572479366708, + "loss": 0.08633296966552734, + "mean_token_accuracy": 0.9812161940336227, + "num_tokens": 16500150.0, + "step": 15625 + }, + { + "entropy": 0.07454636727910838, + "epoch": 1.3800789259022466, + "grad_norm": 1.8359375, + "learning_rate": 0.00011772305949932928, + "loss": 0.1060667610168457, + "mean_token_accuracy": 0.9713787686824799, + "num_tokens": 16527605.0, + "step": 15650 + }, + { + "entropy": 0.05934964165790006, + "epoch": 1.3822835600432108, + "grad_norm": 1.71875, + "learning_rate": 0.00011748877066592192, + "loss": 0.08717299461364746, + "mean_token_accuracy": 0.9823898765444755, + "num_tokens": 16554331.0, + "step": 15675 + }, + { + "entropy": 0.06696738389495295, + "epoch": 1.3844881941841751, + "grad_norm": 0.6796875, + "learning_rate": 0.00011725438276362799, + "loss": 0.09112286567687988, + "mean_token_accuracy": 0.9790424865484237, + "num_tokens": 16580564.0, + "step": 15700 + }, + { + "entropy": 0.06791757563012651, + "epoch": 1.3866928283251394, + "grad_norm": 1.4140625, + "learning_rate": 0.00011701989712018798, + "loss": 0.09548748970031738, + "mean_token_accuracy": 0.9753042080998421, + "num_tokens": 16606721.0, + "step": 15725 + }, + { + "entropy": 0.060772258413926465, + "epoch": 1.3888974624661037, + "grad_norm": 2.140625, + "learning_rate": 0.00011678531506389594, + "loss": 0.08734206199645995, + "mean_token_accuracy": 0.9813879826664924, + "num_tokens": 16632865.0, + "step": 15750 + }, + { + "entropy": 0.08063680976076285, + "epoch": 1.3911020966070682, + "grad_norm": 1.359375, + "learning_rate": 0.00011655063792359226, + "loss": 0.11765849113464355, + "mean_token_accuracy": 0.9731235000491142, + "num_tokens": 16660928.0, + "step": 15775 + }, + { + "entropy": 0.06548881910362979, + "epoch": 1.3933067307480322, + "grad_norm": 3.515625, + "learning_rate": 0.00011631586702865582, + "loss": 0.08853742599487305, + "mean_token_accuracy": 0.982549340724945, + "num_tokens": 16686061.0, + "step": 15800 + }, + { + "entropy": 0.05701154603782925, + "epoch": 1.3955113648889967, + "grad_norm": 3.015625, + "learning_rate": 0.00011608100370899664, + "loss": 0.08760712623596191, + "mean_token_accuracy": 0.9825446775555611, + "num_tokens": 16712645.0, + "step": 15825 + }, + { + "entropy": 0.07111379386769841, + "epoch": 1.397715999029961, + "grad_norm": 3.171875, + "learning_rate": 0.00011584604929504823, + "loss": 0.10779889106750488, + "mean_token_accuracy": 0.9773144674301147, + "num_tokens": 16739270.0, + "step": 15850 + }, + { + "entropy": 0.06202032042929204, + "epoch": 1.3999206331709253, + "grad_norm": 1.1796875, + "learning_rate": 0.00011561100511776026, + "loss": 0.06711362838745118, + "mean_token_accuracy": 0.9842124137282372, + "num_tokens": 16766150.0, + "step": 15875 + }, + { + "entropy": 0.05521934061674983, + "epoch": 1.4021252673118896, + "grad_norm": 1.65625, + "learning_rate": 0.00011537587250859081, + "loss": 0.07264583587646484, + "mean_token_accuracy": 0.9817525637149811, + "num_tokens": 16791967.0, + "step": 15900 + }, + { + "entropy": 0.06004962370701833, + "epoch": 1.4043299014528539, + "grad_norm": 2.40625, + "learning_rate": 0.00011514065279949882, + "loss": 0.09220392227172852, + "mean_token_accuracy": 0.979226841032505, + "num_tokens": 16818341.0, + "step": 15925 + }, + { + "entropy": 0.06723880005811225, + "epoch": 1.4065345355938181, + "grad_norm": 2.03125, + "learning_rate": 0.00011490534732293677, + "loss": 0.0945945930480957, + "mean_token_accuracy": 0.9777571973204613, + "num_tokens": 16844714.0, + "step": 15950 + }, + { + "entropy": 0.06054009918632801, + "epoch": 1.4087391697347824, + "grad_norm": 1.859375, + "learning_rate": 0.00011466995741184288, + "loss": 0.08592336654663085, + "mean_token_accuracy": 0.9799673467874527, + "num_tokens": 16870803.0, + "step": 15975 + }, + { + "entropy": 0.061580221586773405, + "epoch": 1.410943803875747, + "grad_norm": 0.83984375, + "learning_rate": 0.00011443448439963374, + "loss": 0.07537917137145995, + "mean_token_accuracy": 0.9815564286708832, + "num_tokens": 16896056.0, + "step": 16000 + }, + { + "epoch": 1.410943803875747, + "eval_entropy": 0.03029697120738228, + "eval_loss": 0.035930126905441284, + "eval_mean_token_accuracy": 0.9891901263627977, + "eval_num_tokens": 16896056.0, + "eval_runtime": 248.7891, + "eval_samples_per_second": 15.792, + "eval_steps_per_second": 3.951, + "step": 16000 + }, + { + "entropy": 0.07580967456306098, + "epoch": 1.4131484380167112, + "grad_norm": 3.484375, + "learning_rate": 0.00011419892962019665, + "loss": 0.11364545822143554, + "mean_token_accuracy": 0.9733854481577873, + "num_tokens": 16924947.0, + "step": 16025 + }, + { + "entropy": 0.05546699926489964, + "epoch": 1.4153530721576755, + "grad_norm": 1.3125, + "learning_rate": 0.00011396329440788207, + "loss": 0.08760024070739746, + "mean_token_accuracy": 0.981015156507492, + "num_tokens": 16951077.0, + "step": 16050 + }, + { + "entropy": 0.08654316918458789, + "epoch": 1.4175577062986398, + "grad_norm": 2.296875, + "learning_rate": 0.00011372758009749615, + "loss": 0.1186264705657959, + "mean_token_accuracy": 0.974712208211422, + "num_tokens": 16979577.0, + "step": 16075 + }, + { + "entropy": 0.053080160124518445, + "epoch": 1.419762340439604, + "grad_norm": 0.96875, + "learning_rate": 0.00011349178802429308, + "loss": 0.07206877708435058, + "mean_token_accuracy": 0.9819573852419853, + "num_tokens": 17005306.0, + "step": 16100 + }, + { + "entropy": 0.06058032729837578, + "epoch": 1.4219669745805683, + "grad_norm": 0.93359375, + "learning_rate": 0.00011325591952396755, + "loss": 0.08690043449401856, + "mean_token_accuracy": 0.980279144346714, + "num_tokens": 17030374.0, + "step": 16125 + }, + { + "entropy": 0.06873179055764922, + "epoch": 1.4241716087215326, + "grad_norm": 2.4375, + "learning_rate": 0.00011301997593264717, + "loss": 0.1052779483795166, + "mean_token_accuracy": 0.9760980147123337, + "num_tokens": 17058470.0, + "step": 16150 + }, + { + "entropy": 0.06835755564185092, + "epoch": 1.426376242862497, + "grad_norm": 3.25, + "learning_rate": 0.00011278395858688493, + "loss": 0.09940330505371094, + "mean_token_accuracy": 0.9808136349916459, + "num_tokens": 17084609.0, + "step": 16175 + }, + { + "entropy": 0.05775143167622446, + "epoch": 1.4285808770034611, + "grad_norm": 1.21875, + "learning_rate": 0.00011254786882365169, + "loss": 0.07310149669647217, + "mean_token_accuracy": 0.9825145149230957, + "num_tokens": 17110272.0, + "step": 16200 + }, + { + "entropy": 0.05565455894873594, + "epoch": 1.4307855111444256, + "grad_norm": 1.4609375, + "learning_rate": 0.00011231170798032839, + "loss": 0.07367197513580322, + "mean_token_accuracy": 0.9814733082056045, + "num_tokens": 17137287.0, + "step": 16225 + }, + { + "entropy": 0.06046327952935826, + "epoch": 1.43299014528539, + "grad_norm": 1.1015625, + "learning_rate": 0.00011207547739469882, + "loss": 0.08034516334533691, + "mean_token_accuracy": 0.980512827038765, + "num_tokens": 17164148.0, + "step": 16250 + }, + { + "entropy": 0.07160910110978876, + "epoch": 1.4351947794263542, + "grad_norm": 2.578125, + "learning_rate": 0.00011183917840494156, + "loss": 0.09737605094909668, + "mean_token_accuracy": 0.9788093277812004, + "num_tokens": 17190982.0, + "step": 16275 + }, + { + "entropy": 0.06516157711521373, + "epoch": 1.4373994135673185, + "grad_norm": 2.40625, + "learning_rate": 0.00011160281234962296, + "loss": 0.0906210994720459, + "mean_token_accuracy": 0.979938026368618, + "num_tokens": 17217765.0, + "step": 16300 + }, + { + "entropy": 0.06518849867788959, + "epoch": 1.4396040477082828, + "grad_norm": 3.15625, + "learning_rate": 0.00011136638056768909, + "loss": 0.09936015129089355, + "mean_token_accuracy": 0.981353671848774, + "num_tokens": 17244375.0, + "step": 16325 + }, + { + "entropy": 0.06567524242636864, + "epoch": 1.441808681849247, + "grad_norm": 2.796875, + "learning_rate": 0.00011112988439845847, + "loss": 0.08795083999633789, + "mean_token_accuracy": 0.9803341096639633, + "num_tokens": 17271304.0, + "step": 16350 + }, + { + "entropy": 0.06841813209190149, + "epoch": 1.4440133159902113, + "grad_norm": 2.328125, + "learning_rate": 0.00011089332518161424, + "loss": 0.09773797035217285, + "mean_token_accuracy": 0.9775998306274414, + "num_tokens": 17298464.0, + "step": 16375 + }, + { + "entropy": 0.049831424697695186, + "epoch": 1.4462179501311758, + "grad_norm": 2.109375, + "learning_rate": 0.00011065670425719677, + "loss": 0.06948981761932373, + "mean_token_accuracy": 0.9838525611162185, + "num_tokens": 17324374.0, + "step": 16400 + }, + { + "entropy": 0.046832082935143265, + "epoch": 1.44842258427214, + "grad_norm": 0.94140625, + "learning_rate": 0.00011042002296559593, + "loss": 0.07201289176940918, + "mean_token_accuracy": 0.9847058826684951, + "num_tokens": 17350259.0, + "step": 16425 + }, + { + "entropy": 0.07186193967063445, + "epoch": 1.4506272184131044, + "grad_norm": 3.09375, + "learning_rate": 0.00011018328264754363, + "loss": 0.09589914321899413, + "mean_token_accuracy": 0.9776010760664939, + "num_tokens": 17377041.0, + "step": 16450 + }, + { + "entropy": 0.05535520975492545, + "epoch": 1.4528318525540687, + "grad_norm": 2.6875, + "learning_rate": 0.00010994648464410606, + "loss": 0.06740634441375733, + "mean_token_accuracy": 0.9842914417386055, + "num_tokens": 17402825.0, + "step": 16475 + }, + { + "entropy": 0.05898763036486344, + "epoch": 1.455036486695033, + "grad_norm": 2.875, + "learning_rate": 0.00010970963029667625, + "loss": 0.08764254570007324, + "mean_token_accuracy": 0.9801486966013908, + "num_tokens": 17430285.0, + "step": 16500 + }, + { + "entropy": 0.062050230994209414, + "epoch": 1.4572411208359972, + "grad_norm": 4.34375, + "learning_rate": 0.00010947272094696632, + "loss": 0.0902223014831543, + "mean_token_accuracy": 0.9799307104945183, + "num_tokens": 17455617.0, + "step": 16525 + }, + { + "entropy": 0.06111173116034479, + "epoch": 1.4594457549769615, + "grad_norm": 5.21875, + "learning_rate": 0.00010923575793700008, + "loss": 0.0825214958190918, + "mean_token_accuracy": 0.9789560279250145, + "num_tokens": 17481328.0, + "step": 16550 + }, + { + "entropy": 0.0673610715954419, + "epoch": 1.461650389117926, + "grad_norm": 2.484375, + "learning_rate": 0.00010899874260910517, + "loss": 0.09007739067077637, + "mean_token_accuracy": 0.9788965311646461, + "num_tokens": 17508388.0, + "step": 16575 + }, + { + "entropy": 0.06072841239583795, + "epoch": 1.46385502325889, + "grad_norm": 1.625, + "learning_rate": 0.00010876167630590577, + "loss": 0.08078091621398925, + "mean_token_accuracy": 0.9796930846571922, + "num_tokens": 17535426.0, + "step": 16600 + }, + { + "entropy": 0.0715254348909366, + "epoch": 1.4660596573998546, + "grad_norm": 0.7578125, + "learning_rate": 0.00010852456037031462, + "loss": 0.09795802116394042, + "mean_token_accuracy": 0.9788025477528572, + "num_tokens": 17562233.0, + "step": 16625 + }, + { + "entropy": 0.07394725729187485, + "epoch": 1.4682642915408188, + "grad_norm": 2.0, + "learning_rate": 0.00010828739614552577, + "loss": 0.10762414932250977, + "mean_token_accuracy": 0.9766439932584763, + "num_tokens": 17589739.0, + "step": 16650 + }, + { + "entropy": 0.057726316754124124, + "epoch": 1.4704689256817831, + "grad_norm": 1.2421875, + "learning_rate": 0.00010805018497500674, + "loss": 0.07878723621368408, + "mean_token_accuracy": 0.9822485408186913, + "num_tokens": 17615710.0, + "step": 16675 + }, + { + "entropy": 0.050235752706357745, + "epoch": 1.4726735598227474, + "grad_norm": 2.6875, + "learning_rate": 0.000107812928202491, + "loss": 0.06715853214263916, + "mean_token_accuracy": 0.9836687427759171, + "num_tokens": 17641492.0, + "step": 16700 + }, + { + "entropy": 0.060145079759095096, + "epoch": 1.4748781939637117, + "grad_norm": 4.40625, + "learning_rate": 0.00010757562717197039, + "loss": 0.102103910446167, + "mean_token_accuracy": 0.9777582693099975, + "num_tokens": 17668109.0, + "step": 16725 + }, + { + "entropy": 0.0603743210065295, + "epoch": 1.477082828104676, + "grad_norm": 3.671875, + "learning_rate": 0.00010733828322768738, + "loss": 0.079124174118042, + "mean_token_accuracy": 0.9801980945467949, + "num_tokens": 17694064.0, + "step": 16750 + }, + { + "entropy": 0.0633892720109725, + "epoch": 1.4792874622456402, + "grad_norm": 2.03125, + "learning_rate": 0.00010710089771412752, + "loss": 0.08133198738098145, + "mean_token_accuracy": 0.9809553810954094, + "num_tokens": 17720061.0, + "step": 16775 + }, + { + "entropy": 0.06459446613931505, + "epoch": 1.4814920963866047, + "grad_norm": 2.640625, + "learning_rate": 0.00010686347197601197, + "loss": 0.08791131019592285, + "mean_token_accuracy": 0.9798620289564133, + "num_tokens": 17747327.0, + "step": 16800 + }, + { + "entropy": 0.05424744870688301, + "epoch": 1.483696730527569, + "grad_norm": 1.0390625, + "learning_rate": 0.00010662600735828963, + "loss": 0.08159908294677734, + "mean_token_accuracy": 0.9822318425774574, + "num_tokens": 17773709.0, + "step": 16825 + }, + { + "entropy": 0.05502179280680139, + "epoch": 1.4859013646685333, + "grad_norm": 1.125, + "learning_rate": 0.00010638850520612967, + "loss": 0.06816422939300537, + "mean_token_accuracy": 0.9826434323191643, + "num_tokens": 17798111.0, + "step": 16850 + }, + { + "entropy": 0.05482125427632127, + "epoch": 1.4881059988094976, + "grad_norm": 0.9921875, + "learning_rate": 0.00010615096686491387, + "loss": 0.07486906051635742, + "mean_token_accuracy": 0.9844721156358719, + "num_tokens": 17823621.0, + "step": 16875 + }, + { + "entropy": 0.06038864826390636, + "epoch": 1.4903106329504618, + "grad_norm": 0.50390625, + "learning_rate": 0.0001059133936802291, + "loss": 0.06850027084350586, + "mean_token_accuracy": 0.9790200263261795, + "num_tokens": 17851059.0, + "step": 16900 + }, + { + "entropy": 0.05871053482478601, + "epoch": 1.4925152670914261, + "grad_norm": 2.46875, + "learning_rate": 0.00010567578699785953, + "loss": 0.0959819221496582, + "mean_token_accuracy": 0.9798403069376945, + "num_tokens": 17877902.0, + "step": 16925 + }, + { + "entropy": 0.055686095813143766, + "epoch": 1.4947199012323904, + "grad_norm": 0.59765625, + "learning_rate": 0.00010543814816377902, + "loss": 0.07089345932006835, + "mean_token_accuracy": 0.9839478823542595, + "num_tokens": 17904792.0, + "step": 16950 + }, + { + "entropy": 0.05614824989432236, + "epoch": 1.496924535373355, + "grad_norm": 1.5234375, + "learning_rate": 0.00010520047852414371, + "loss": 0.07994057178497314, + "mean_token_accuracy": 0.9789690652489662, + "num_tokens": 17930852.0, + "step": 16975 + }, + { + "entropy": 0.05661927604916855, + "epoch": 1.4991291695143192, + "grad_norm": 0.89453125, + "learning_rate": 0.00010496277942528412, + "loss": 0.0782936954498291, + "mean_token_accuracy": 0.9812395250797272, + "num_tokens": 17958147.0, + "step": 17000 + }, + { + "epoch": 1.4991291695143192, + "eval_entropy": 0.028475278459578403, + "eval_loss": 0.032593537122011185, + "eval_mean_token_accuracy": 0.989732926459424, + "eval_num_tokens": 17958147.0, + "eval_runtime": 248.1012, + "eval_samples_per_second": 15.836, + "eval_steps_per_second": 3.962, + "step": 17000 + }, + { + "entropy": 0.056141581527772356, + "epoch": 1.5013338036552835, + "grad_norm": 4.15625, + "learning_rate": 0.00010472505221369773, + "loss": 0.06809127807617188, + "mean_token_accuracy": 0.9798111498355866, + "num_tokens": 17984383.0, + "step": 17025 + }, + { + "entropy": 0.06520437844737899, + "epoch": 1.5035384377962477, + "grad_norm": 2.734375, + "learning_rate": 0.00010448729823604124, + "loss": 0.10084832191467286, + "mean_token_accuracy": 0.980366622209549, + "num_tokens": 18012325.0, + "step": 17050 + }, + { + "entropy": 0.06314173155333265, + "epoch": 1.505743071937212, + "grad_norm": 3.140625, + "learning_rate": 0.00010424951883912295, + "loss": 0.07563630580902099, + "mean_token_accuracy": 0.9817772355675697, + "num_tokens": 18039522.0, + "step": 17075 + }, + { + "entropy": 0.047563048835654625, + "epoch": 1.5079477060781763, + "grad_norm": 1.3046875, + "learning_rate": 0.00010401171536989517, + "loss": 0.05595874309539795, + "mean_token_accuracy": 0.9854984751343727, + "num_tokens": 18064974.0, + "step": 17100 + }, + { + "entropy": 0.053548066731600556, + "epoch": 1.5101523402191406, + "grad_norm": 1.4296875, + "learning_rate": 0.0001037738891754466, + "loss": 0.07935151100158691, + "mean_token_accuracy": 0.9796605777740478, + "num_tokens": 18091851.0, + "step": 17125 + }, + { + "entropy": 0.05894872721350111, + "epoch": 1.512356974360105, + "grad_norm": 0.8984375, + "learning_rate": 0.00010353604160299464, + "loss": 0.08113452911376953, + "mean_token_accuracy": 0.982445887029171, + "num_tokens": 18118947.0, + "step": 17150 + }, + { + "entropy": 0.05036383649487106, + "epoch": 1.5145616085010691, + "grad_norm": 0.68359375, + "learning_rate": 0.0001032981739998778, + "loss": 0.05923966407775879, + "mean_token_accuracy": 0.9860278350114823, + "num_tokens": 18143717.0, + "step": 17175 + }, + { + "entropy": 0.046229548780756885, + "epoch": 1.5167662426420336, + "grad_norm": 0.373046875, + "learning_rate": 0.000103060287713548, + "loss": 0.07923832416534424, + "mean_token_accuracy": 0.9831987258791923, + "num_tokens": 18168349.0, + "step": 17200 + }, + { + "entropy": 0.06594804828648193, + "epoch": 1.518970876782998, + "grad_norm": 3.9375, + "learning_rate": 0.00010282238409156315, + "loss": 0.11481256484985351, + "mean_token_accuracy": 0.9762162980437279, + "num_tokens": 18194696.0, + "step": 17225 + }, + { + "entropy": 0.06344727827017778, + "epoch": 1.5211755109239622, + "grad_norm": 0.28125, + "learning_rate": 0.00010258446448157917, + "loss": 0.08004162788391113, + "mean_token_accuracy": 0.9845002761483193, + "num_tokens": 18222060.0, + "step": 17250 + }, + { + "entropy": 0.07739625903428532, + "epoch": 1.5233801450649265, + "grad_norm": 0.9375, + "learning_rate": 0.00010234653023134276, + "loss": 0.11823193550109863, + "mean_token_accuracy": 0.9756862100958824, + "num_tokens": 18250159.0, + "step": 17275 + }, + { + "entropy": 0.048326283237038296, + "epoch": 1.5255847792058908, + "grad_norm": 2.765625, + "learning_rate": 0.00010210858268868328, + "loss": 0.05785459041595459, + "mean_token_accuracy": 0.9836695104837417, + "num_tokens": 18276170.0, + "step": 17300 + }, + { + "entropy": 0.055761023027298504, + "epoch": 1.5277894133468553, + "grad_norm": 2.3125, + "learning_rate": 0.00010187062320150564, + "loss": 0.07718227386474609, + "mean_token_accuracy": 0.9812486064434052, + "num_tokens": 18302079.0, + "step": 17325 + }, + { + "entropy": 0.06133182929028408, + "epoch": 1.5299940474878193, + "grad_norm": 1.6484375, + "learning_rate": 0.00010163265311778227, + "loss": 0.07602161884307862, + "mean_token_accuracy": 0.9805529493093491, + "num_tokens": 18328582.0, + "step": 17350 + }, + { + "entropy": 0.04131406945714843, + "epoch": 1.5321986816287838, + "grad_norm": 1.421875, + "learning_rate": 0.00010139467378554572, + "loss": 0.059069275856018066, + "mean_token_accuracy": 0.9861931943893433, + "num_tokens": 18354126.0, + "step": 17375 + }, + { + "entropy": 0.05679673735721735, + "epoch": 1.5344033157697479, + "grad_norm": 1.953125, + "learning_rate": 0.00010115668655288086, + "loss": 0.08031403541564941, + "mean_token_accuracy": 0.9827021709084511, + "num_tokens": 18380922.0, + "step": 17400 + }, + { + "entropy": 0.050058743792469614, + "epoch": 1.5366079499107124, + "grad_norm": 1.3984375, + "learning_rate": 0.0001009186927679173, + "loss": 0.0634553337097168, + "mean_token_accuracy": 0.9828399559855461, + "num_tokens": 18406786.0, + "step": 17425 + }, + { + "entropy": 0.06921417500561802, + "epoch": 1.5388125840516766, + "grad_norm": 0.984375, + "learning_rate": 0.0001006806937788218, + "loss": 0.10088022232055664, + "mean_token_accuracy": 0.9784321439266205, + "num_tokens": 18434199.0, + "step": 17450 + }, + { + "entropy": 0.05730687389936065, + "epoch": 1.541017218192641, + "grad_norm": 0.96875, + "learning_rate": 0.00010044269093379066, + "loss": 0.07943611145019532, + "mean_token_accuracy": 0.9813975363969802, + "num_tokens": 18460028.0, + "step": 17475 + }, + { + "entropy": 0.0577721811196534, + "epoch": 1.5432218523336052, + "grad_norm": 2.71875, + "learning_rate": 0.00010020468558104192, + "loss": 0.07405065059661865, + "mean_token_accuracy": 0.9820497670769691, + "num_tokens": 18484935.0, + "step": 17500 + }, + { + "entropy": 0.06218616932768782, + "epoch": 1.5454264864745695, + "grad_norm": 2.15625, + "learning_rate": 9.996667906880787e-05, + "loss": 0.07520933151245117, + "mean_token_accuracy": 0.9795030042529106, + "num_tokens": 18509996.0, + "step": 17525 + }, + { + "entropy": 0.05698599866198492, + "epoch": 1.547631120615534, + "grad_norm": 0.7890625, + "learning_rate": 9.972867274532739e-05, + "loss": 0.0772179651260376, + "mean_token_accuracy": 0.9817389845848083, + "num_tokens": 18536971.0, + "step": 17550 + }, + { + "entropy": 0.05628544877305103, + "epoch": 1.549835754756498, + "grad_norm": 2.9375, + "learning_rate": 9.94906679588382e-05, + "loss": 0.07337986946105957, + "mean_token_accuracy": 0.9811558586359024, + "num_tokens": 18562553.0, + "step": 17575 + }, + { + "entropy": 0.05942114258214133, + "epoch": 1.5520403888974625, + "grad_norm": 1.3984375, + "learning_rate": 9.925266605756944e-05, + "loss": 0.07937191486358643, + "mean_token_accuracy": 0.9792503651976585, + "num_tokens": 18588415.0, + "step": 17600 + }, + { + "entropy": 0.05903674538480118, + "epoch": 1.5542450230384268, + "grad_norm": 2.796875, + "learning_rate": 9.901466838973386e-05, + "loss": 0.0790792989730835, + "mean_token_accuracy": 0.9809808561205864, + "num_tokens": 18615439.0, + "step": 17625 + }, + { + "entropy": 0.047130853843918886, + "epoch": 1.556449657179391, + "grad_norm": 0.609375, + "learning_rate": 9.87766763035202e-05, + "loss": 0.058544158935546875, + "mean_token_accuracy": 0.9855070438981056, + "num_tokens": 18641119.0, + "step": 17650 + }, + { + "entropy": 0.05112945525848772, + "epoch": 1.5586542913203554, + "grad_norm": 1.4921875, + "learning_rate": 9.853869114708556e-05, + "loss": 0.0614125919342041, + "mean_token_accuracy": 0.9811950054764748, + "num_tokens": 18666641.0, + "step": 17675 + }, + { + "entropy": 0.05161150803469354, + "epoch": 1.5608589254613197, + "grad_norm": 3.9375, + "learning_rate": 9.830071426854784e-05, + "loss": 0.06254351615905762, + "mean_token_accuracy": 0.9838355273008347, + "num_tokens": 18692161.0, + "step": 17700 + }, + { + "entropy": 0.058606664875915156, + "epoch": 1.5630635596022842, + "grad_norm": 1.171875, + "learning_rate": 9.806274701597806e-05, + "loss": 0.08459006309509277, + "mean_token_accuracy": 0.9809410843253136, + "num_tokens": 18718428.0, + "step": 17725 + }, + { + "entropy": 0.059776942255703035, + "epoch": 1.5652681937432482, + "grad_norm": 2.390625, + "learning_rate": 9.782479073739268e-05, + "loss": 0.07257438182830811, + "mean_token_accuracy": 0.9825310951471329, + "num_tokens": 18745938.0, + "step": 17750 + }, + { + "entropy": 0.05679215215088334, + "epoch": 1.5674728278842127, + "grad_norm": 1.46875, + "learning_rate": 9.758684678074594e-05, + "loss": 0.07460373878479004, + "mean_token_accuracy": 0.9839976826310157, + "num_tokens": 18771583.0, + "step": 17775 + }, + { + "entropy": 0.06317433909796819, + "epoch": 1.5696774620251768, + "grad_norm": 2.453125, + "learning_rate": 9.734891649392238e-05, + "loss": 0.08541607856750488, + "mean_token_accuracy": 0.980264983177185, + "num_tokens": 18799453.0, + "step": 17800 + }, + { + "entropy": 0.06873761809001736, + "epoch": 1.5718820961661413, + "grad_norm": 2.078125, + "learning_rate": 9.711100122472908e-05, + "loss": 0.10108554840087891, + "mean_token_accuracy": 0.9776660186052323, + "num_tokens": 18826048.0, + "step": 17825 + }, + { + "entropy": 0.048909812513666114, + "epoch": 1.5740867303071056, + "grad_norm": 2.53125, + "learning_rate": 9.6873102320888e-05, + "loss": 0.06565195560455322, + "mean_token_accuracy": 0.9847191327810287, + "num_tokens": 18852389.0, + "step": 17850 + }, + { + "entropy": 0.05157642891848809, + "epoch": 1.5762913644480698, + "grad_norm": 1.171875, + "learning_rate": 9.663522113002844e-05, + "loss": 0.08032276153564454, + "mean_token_accuracy": 0.9816439139842987, + "num_tokens": 18879043.0, + "step": 17875 + }, + { + "entropy": 0.048102616354735804, + "epoch": 1.578495998589034, + "grad_norm": 0.44140625, + "learning_rate": 9.639735899967931e-05, + "loss": 0.06498304843902587, + "mean_token_accuracy": 0.9843746635317803, + "num_tokens": 18905398.0, + "step": 17900 + }, + { + "entropy": 0.055048747494947745, + "epoch": 1.5807006327299984, + "grad_norm": 0.53515625, + "learning_rate": 9.615951727726162e-05, + "loss": 0.07438690662384033, + "mean_token_accuracy": 0.981828630566597, + "num_tokens": 18932068.0, + "step": 17925 + }, + { + "entropy": 0.05669259457528824, + "epoch": 1.582905266870963, + "grad_norm": 1.5703125, + "learning_rate": 9.592169731008076e-05, + "loss": 0.07773910522460938, + "mean_token_accuracy": 0.9835181996226311, + "num_tokens": 18959147.0, + "step": 17950 + }, + { + "entropy": 0.04661863596928015, + "epoch": 1.585109901011927, + "grad_norm": 0.54296875, + "learning_rate": 9.568390044531887e-05, + "loss": 0.06266080856323242, + "mean_token_accuracy": 0.9864483639597893, + "num_tokens": 18984022.0, + "step": 17975 + }, + { + "entropy": 0.05930834055492596, + "epoch": 1.5873145351528914, + "grad_norm": 0.70703125, + "learning_rate": 9.544612803002711e-05, + "loss": 0.09378492355346679, + "mean_token_accuracy": 0.9771989992260933, + "num_tokens": 19011238.0, + "step": 18000 + }, + { + "epoch": 1.5873145351528914, + "eval_entropy": 0.025065774224757285, + "eval_loss": 0.030993424355983734, + "eval_mean_token_accuracy": 0.9900934713175435, + "eval_num_tokens": 19011238.0, + "eval_runtime": 247.7334, + "eval_samples_per_second": 15.86, + "eval_steps_per_second": 3.968, + "step": 18000 + }, + { + "entropy": 0.05400558569199347, + "epoch": 1.5895191692938557, + "grad_norm": 3.8125, + "learning_rate": 9.520838141111833e-05, + "loss": 0.07712131023406982, + "mean_token_accuracy": 0.9816086456179619, + "num_tokens": 19037710.0, + "step": 18025 + }, + { + "entropy": 0.05443809665448498, + "epoch": 1.59172380343482, + "grad_norm": 0.26953125, + "learning_rate": 9.497066193535917e-05, + "loss": 0.08530388832092285, + "mean_token_accuracy": 0.981157968044281, + "num_tokens": 19064722.0, + "step": 18050 + }, + { + "entropy": 0.05699371125425387, + "epoch": 1.5939284375757843, + "grad_norm": 2.5, + "learning_rate": 9.473297094936247e-05, + "loss": 0.07788813591003418, + "mean_token_accuracy": 0.9828856268525124, + "num_tokens": 19090316.0, + "step": 18075 + }, + { + "entropy": 0.059593895952129966, + "epoch": 1.5961330717167486, + "grad_norm": 1.359375, + "learning_rate": 9.449530979957977e-05, + "loss": 0.07819770812988282, + "mean_token_accuracy": 0.9822254714369774, + "num_tokens": 19117146.0, + "step": 18100 + }, + { + "entropy": 0.06011313027673168, + "epoch": 1.598337705857713, + "grad_norm": 0.9453125, + "learning_rate": 9.425767983229346e-05, + "loss": 0.07340410232543945, + "mean_token_accuracy": 0.9806813663244247, + "num_tokens": 19144537.0, + "step": 18125 + }, + { + "entropy": 0.04968562399721122, + "epoch": 1.6005423399986771, + "grad_norm": 0.875, + "learning_rate": 9.402008239360944e-05, + "loss": 0.06933138847351074, + "mean_token_accuracy": 0.9852479723095894, + "num_tokens": 19170209.0, + "step": 18150 + }, + { + "entropy": 0.060237830583500906, + "epoch": 1.6027469741396416, + "grad_norm": 0.384765625, + "learning_rate": 9.378251882944932e-05, + "loss": 0.07463406085968018, + "mean_token_accuracy": 0.9815969231724739, + "num_tokens": 19196223.0, + "step": 18175 + }, + { + "entropy": 0.05468057305733964, + "epoch": 1.6049516082806057, + "grad_norm": 0.984375, + "learning_rate": 9.354499048554273e-05, + "loss": 0.07429322242736816, + "mean_token_accuracy": 0.9817409634590148, + "num_tokens": 19222714.0, + "step": 18200 + }, + { + "entropy": 0.050176919560835816, + "epoch": 1.6071562424215702, + "grad_norm": 1.0, + "learning_rate": 9.33074987074198e-05, + "loss": 0.07442711353302002, + "mean_token_accuracy": 0.9835793408751488, + "num_tokens": 19248965.0, + "step": 18225 + }, + { + "entropy": 0.05845318389037857, + "epoch": 1.6093608765625345, + "grad_norm": 3.171875, + "learning_rate": 9.307004484040361e-05, + "loss": 0.07511641502380371, + "mean_token_accuracy": 0.9822868245840073, + "num_tokens": 19274620.0, + "step": 18250 + }, + { + "entropy": 0.057679404538066595, + "epoch": 1.6115655107034987, + "grad_norm": 0.5390625, + "learning_rate": 9.28326302296025e-05, + "loss": 0.08816563606262207, + "mean_token_accuracy": 0.9820525646209717, + "num_tokens": 19301416.0, + "step": 18275 + }, + { + "entropy": 0.05478143262356752, + "epoch": 1.613770144844463, + "grad_norm": 2.703125, + "learning_rate": 9.259525621990227e-05, + "loss": 0.0690168571472168, + "mean_token_accuracy": 0.9814907485246658, + "num_tokens": 19328100.0, + "step": 18300 + }, + { + "entropy": 0.04510672772856197, + "epoch": 1.6159747789854273, + "grad_norm": 0.384765625, + "learning_rate": 9.235792415595887e-05, + "loss": 0.062098612785339354, + "mean_token_accuracy": 0.9843040198087692, + "num_tokens": 19353794.0, + "step": 18325 + }, + { + "entropy": 0.058022860687196955, + "epoch": 1.6181794131263918, + "grad_norm": 3.0, + "learning_rate": 9.212063538219059e-05, + "loss": 0.07701887130737305, + "mean_token_accuracy": 0.9803284251689911, + "num_tokens": 19381164.0, + "step": 18350 + }, + { + "entropy": 0.0572402674825571, + "epoch": 1.6203840472673559, + "grad_norm": 1.0546875, + "learning_rate": 9.188339124277056e-05, + "loss": 0.09371196746826171, + "mean_token_accuracy": 0.9804246252775193, + "num_tokens": 19408403.0, + "step": 18375 + }, + { + "entropy": 0.056358818953158335, + "epoch": 1.6225886814083204, + "grad_norm": 1.34375, + "learning_rate": 9.164619308161894e-05, + "loss": 0.07747916221618652, + "mean_token_accuracy": 0.9822709369659424, + "num_tokens": 19435431.0, + "step": 18400 + }, + { + "entropy": 0.05615487261035014, + "epoch": 1.6247933155492846, + "grad_norm": 0.65234375, + "learning_rate": 9.140904224239555e-05, + "loss": 0.0793535327911377, + "mean_token_accuracy": 0.9820253443717957, + "num_tokens": 19461362.0, + "step": 18425 + }, + { + "entropy": 0.04601802124358073, + "epoch": 1.626997949690249, + "grad_norm": 0.462890625, + "learning_rate": 9.117194006849207e-05, + "loss": 0.057888431549072264, + "mean_token_accuracy": 0.9839033776521683, + "num_tokens": 19487621.0, + "step": 18450 + }, + { + "entropy": 0.05517784896073863, + "epoch": 1.6292025838312132, + "grad_norm": 3.265625, + "learning_rate": 9.09348879030246e-05, + "loss": 0.06926989078521728, + "mean_token_accuracy": 0.9825824412703514, + "num_tokens": 19514533.0, + "step": 18475 + }, + { + "entropy": 0.05407030944246799, + "epoch": 1.6314072179721775, + "grad_norm": 1.6875, + "learning_rate": 9.069788708882582e-05, + "loss": 0.06324063777923584, + "mean_token_accuracy": 0.9833697432279587, + "num_tokens": 19540686.0, + "step": 18500 + }, + { + "entropy": 0.049242617936106396, + "epoch": 1.633611852113142, + "grad_norm": 0.98046875, + "learning_rate": 9.046093896843764e-05, + "loss": 0.07035034656524658, + "mean_token_accuracy": 0.982454896569252, + "num_tokens": 19566734.0, + "step": 18525 + }, + { + "entropy": 0.05330205393038341, + "epoch": 1.635816486254106, + "grad_norm": 1.6484375, + "learning_rate": 9.02240448841034e-05, + "loss": 0.07815911769866943, + "mean_token_accuracy": 0.9762992030382156, + "num_tokens": 19592970.0, + "step": 18550 + }, + { + "entropy": 0.060869988883641785, + "epoch": 1.6380211203950705, + "grad_norm": 0.875, + "learning_rate": 8.998720617776044e-05, + "loss": 0.08916851043701172, + "mean_token_accuracy": 0.9811286148428917, + "num_tokens": 19619172.0, + "step": 18575 + }, + { + "entropy": 0.06743482278645388, + "epoch": 1.6402257545360346, + "grad_norm": 1.9296875, + "learning_rate": 8.975042419103222e-05, + "loss": 0.09006739616394042, + "mean_token_accuracy": 0.9787856066226959, + "num_tokens": 19645154.0, + "step": 18600 + }, + { + "entropy": 0.06285342701012268, + "epoch": 1.642430388676999, + "grad_norm": 1.7265625, + "learning_rate": 8.951370026522109e-05, + "loss": 0.0924040412902832, + "mean_token_accuracy": 0.9793640455603599, + "num_tokens": 19672192.0, + "step": 18625 + }, + { + "entropy": 0.05975484314971254, + "epoch": 1.6446350228179634, + "grad_norm": 3.296875, + "learning_rate": 8.92770357413004e-05, + "loss": 0.07495357990264892, + "mean_token_accuracy": 0.982607415318489, + "num_tokens": 19698226.0, + "step": 18650 + }, + { + "entropy": 0.04880306238286721, + "epoch": 1.6468396569589276, + "grad_norm": 2.015625, + "learning_rate": 8.904043195990707e-05, + "loss": 0.06099768161773682, + "mean_token_accuracy": 0.9863478910923004, + "num_tokens": 19724505.0, + "step": 18675 + }, + { + "entropy": 0.05574701564350107, + "epoch": 1.649044291099892, + "grad_norm": 2.65625, + "learning_rate": 8.880389026133378e-05, + "loss": 0.09057734489440918, + "mean_token_accuracy": 0.9812796249985695, + "num_tokens": 19750809.0, + "step": 18700 + }, + { + "entropy": 0.05721397206405527, + "epoch": 1.6512489252408562, + "grad_norm": 0.87890625, + "learning_rate": 8.856741198552171e-05, + "loss": 0.09432357788085938, + "mean_token_accuracy": 0.9806243237853051, + "num_tokens": 19776758.0, + "step": 18725 + }, + { + "entropy": 0.051816781606976295, + "epoch": 1.6534535593818207, + "grad_norm": 4.65625, + "learning_rate": 8.83309984720527e-05, + "loss": 0.0762649917602539, + "mean_token_accuracy": 0.9829356080293655, + "num_tokens": 19801783.0, + "step": 18750 + }, + { + "entropy": 0.05103291466111841, + "epoch": 1.6556581935227848, + "grad_norm": 3.171875, + "learning_rate": 8.809465106014173e-05, + "loss": 0.07494559288024902, + "mean_token_accuracy": 0.9815609979629517, + "num_tokens": 19828390.0, + "step": 18775 + }, + { + "entropy": 0.05235413030990457, + "epoch": 1.6578628276637493, + "grad_norm": 0.67578125, + "learning_rate": 8.785837108862926e-05, + "loss": 0.06882952213287354, + "mean_token_accuracy": 0.9843146482110023, + "num_tokens": 19854446.0, + "step": 18800 + }, + { + "entropy": 0.050876114333077566, + "epoch": 1.6600674618047135, + "grad_norm": 1.7734375, + "learning_rate": 8.762215989597383e-05, + "loss": 0.07045453071594238, + "mean_token_accuracy": 0.9855261752009392, + "num_tokens": 19879183.0, + "step": 18825 + }, + { + "entropy": 0.04869757466043666, + "epoch": 1.6622720959456778, + "grad_norm": 1.1875, + "learning_rate": 8.738601882024435e-05, + "loss": 0.0628709602355957, + "mean_token_accuracy": 0.9841243025660514, + "num_tokens": 19905109.0, + "step": 18850 + }, + { + "entropy": 0.04798712164454628, + "epoch": 1.664476730086642, + "grad_norm": 0.6953125, + "learning_rate": 8.714994919911247e-05, + "loss": 0.07689383506774902, + "mean_token_accuracy": 0.9856678175926209, + "num_tokens": 19932085.0, + "step": 18875 + }, + { + "entropy": 0.05964564758811321, + "epoch": 1.6666813642276064, + "grad_norm": 2.328125, + "learning_rate": 8.691395236984516e-05, + "loss": 0.08691888809204101, + "mean_token_accuracy": 0.9784200477600098, + "num_tokens": 19960684.0, + "step": 18900 + }, + { + "entropy": 0.05663166692873347, + "epoch": 1.6688859983685709, + "grad_norm": 2.46875, + "learning_rate": 8.667802966929694e-05, + "loss": 0.06777063846588134, + "mean_token_accuracy": 0.9836448773741722, + "num_tokens": 19986876.0, + "step": 18925 + }, + { + "entropy": 0.04777500177915499, + "epoch": 1.671090632509535, + "grad_norm": 0.1982421875, + "learning_rate": 8.644218243390248e-05, + "loss": 0.06499977588653565, + "mean_token_accuracy": 0.9848168486356735, + "num_tokens": 20012694.0, + "step": 18950 + }, + { + "entropy": 0.04975124435048201, + "epoch": 1.6732952666504994, + "grad_norm": 1.8515625, + "learning_rate": 8.620641199966901e-05, + "loss": 0.06578896045684815, + "mean_token_accuracy": 0.9852525860071182, + "num_tokens": 20039158.0, + "step": 18975 + }, + { + "entropy": 0.06006684680622129, + "epoch": 1.6754999007914635, + "grad_norm": 1.4375, + "learning_rate": 8.597071970216861e-05, + "loss": 0.08604469299316406, + "mean_token_accuracy": 0.9798215964436531, + "num_tokens": 20066712.0, + "step": 19000 + }, + { + "epoch": 1.6754999007914635, + "eval_entropy": 0.02392154005389293, + "eval_loss": 0.027977393940091133, + "eval_mean_token_accuracy": 0.9912456352858674, + "eval_num_tokens": 20066712.0, + "eval_runtime": 244.0079, + "eval_samples_per_second": 16.102, + "eval_steps_per_second": 4.029, + "step": 19000 + }, + { + "entropy": 0.0617493161274615, + "epoch": 1.677704534932428, + "grad_norm": 0.7109375, + "learning_rate": 8.573510687653072e-05, + "loss": 0.08513182640075684, + "mean_token_accuracy": 0.9791978281736374, + "num_tokens": 20093148.0, + "step": 19025 + }, + { + "entropy": 0.058141036685192375, + "epoch": 1.6799091690733923, + "grad_norm": 1.1171875, + "learning_rate": 8.549957485743467e-05, + "loss": 0.0842889404296875, + "mean_token_accuracy": 0.9809462291002273, + "num_tokens": 20119244.0, + "step": 19050 + }, + { + "entropy": 0.062483344532229236, + "epoch": 1.6821138032143566, + "grad_norm": 3.921875, + "learning_rate": 8.526412497910208e-05, + "loss": 0.08513738632202149, + "mean_token_accuracy": 0.9823807805776597, + "num_tokens": 20145783.0, + "step": 19075 + }, + { + "entropy": 0.047624338713576436, + "epoch": 1.6843184373553208, + "grad_norm": 3.453125, + "learning_rate": 8.502875857528916e-05, + "loss": 0.05738876819610596, + "mean_token_accuracy": 0.9847044974565506, + "num_tokens": 20173172.0, + "step": 19100 + }, + { + "entropy": 0.05098998494475381, + "epoch": 1.686523071496285, + "grad_norm": 1.8359375, + "learning_rate": 8.47934769792793e-05, + "loss": 0.06551202774047851, + "mean_token_accuracy": 0.9852928957343101, + "num_tokens": 20200153.0, + "step": 19125 + }, + { + "entropy": 0.057541024469246624, + "epoch": 1.6887277056372496, + "grad_norm": 1.375, + "learning_rate": 8.45582815238755e-05, + "loss": 0.08337347984313964, + "mean_token_accuracy": 0.9810384732484817, + "num_tokens": 20228509.0, + "step": 19150 + }, + { + "entropy": 0.052381179301446534, + "epoch": 1.6909323397782137, + "grad_norm": 2.453125, + "learning_rate": 8.432317354139276e-05, + "loss": 0.07328851222991943, + "mean_token_accuracy": 0.9824663576483726, + "num_tokens": 20254949.0, + "step": 19175 + }, + { + "entropy": 0.057044620190863496, + "epoch": 1.6931369739191782, + "grad_norm": 0.70703125, + "learning_rate": 8.408815436365066e-05, + "loss": 0.06466145038604737, + "mean_token_accuracy": 0.9842689520120621, + "num_tokens": 20282270.0, + "step": 19200 + }, + { + "entropy": 0.044989108476002, + "epoch": 1.6953416080601424, + "grad_norm": 0.310546875, + "learning_rate": 8.38532253219656e-05, + "loss": 0.05739119529724121, + "mean_token_accuracy": 0.9873190036416054, + "num_tokens": 20308285.0, + "step": 19225 + }, + { + "entropy": 0.05179535857627343, + "epoch": 1.6975462422011067, + "grad_norm": 1.4609375, + "learning_rate": 8.361838774714343e-05, + "loss": 0.07189542770385743, + "mean_token_accuracy": 0.9823496028780937, + "num_tokens": 20335541.0, + "step": 19250 + }, + { + "entropy": 0.04700690143959946, + "epoch": 1.699750876342071, + "grad_norm": 1.7109375, + "learning_rate": 8.338364296947192e-05, + "loss": 0.06394123077392579, + "mean_token_accuracy": 0.983540931046009, + "num_tokens": 20361464.0, + "step": 19275 + }, + { + "entropy": 0.05009625876868085, + "epoch": 1.7019555104830353, + "grad_norm": 1.1015625, + "learning_rate": 8.314899231871316e-05, + "loss": 0.0703705358505249, + "mean_token_accuracy": 0.9844644451141358, + "num_tokens": 20387924.0, + "step": 19300 + }, + { + "entropy": 0.05482434216442925, + "epoch": 1.7041601446239998, + "grad_norm": 0.2216796875, + "learning_rate": 8.291443712409595e-05, + "loss": 0.0688732099533081, + "mean_token_accuracy": 0.9761506044864654, + "num_tokens": 20414494.0, + "step": 19325 + }, + { + "entropy": 0.05794863952425658, + "epoch": 1.7063647787649638, + "grad_norm": 2.71875, + "learning_rate": 8.267997871430844e-05, + "loss": 0.07615211009979247, + "mean_token_accuracy": 0.9809840077161789, + "num_tokens": 20440990.0, + "step": 19350 + }, + { + "entropy": 0.03766856579939486, + "epoch": 1.7085694129059283, + "grad_norm": 1.1171875, + "learning_rate": 8.244561841749048e-05, + "loss": 0.0410503339767456, + "mean_token_accuracy": 0.98747672945261, + "num_tokens": 20466090.0, + "step": 19375 + }, + { + "entropy": 0.048852927869920675, + "epoch": 1.7107740470468926, + "grad_norm": 2.59375, + "learning_rate": 8.221135756122625e-05, + "loss": 0.0601063871383667, + "mean_token_accuracy": 0.9837045565247535, + "num_tokens": 20492618.0, + "step": 19400 + }, + { + "entropy": 0.044831587293992926, + "epoch": 1.712978681187857, + "grad_norm": 1.34375, + "learning_rate": 8.19771974725364e-05, + "loss": 0.05813504695892334, + "mean_token_accuracy": 0.9861867216229439, + "num_tokens": 20518674.0, + "step": 19425 + }, + { + "entropy": 0.059993671221309344, + "epoch": 1.7151833153288212, + "grad_norm": 1.9609375, + "learning_rate": 8.1743139477871e-05, + "loss": 0.07705019474029541, + "mean_token_accuracy": 0.9792517521977424, + "num_tokens": 20545279.0, + "step": 19450 + }, + { + "entropy": 0.05164817209908506, + "epoch": 1.7173879494697855, + "grad_norm": 0.96875, + "learning_rate": 8.150918490310163e-05, + "loss": 0.060844712257385254, + "mean_token_accuracy": 0.9835267442464829, + "num_tokens": 20572166.0, + "step": 19475 + }, + { + "entropy": 0.0489783468087262, + "epoch": 1.71959258361075, + "grad_norm": 2.125, + "learning_rate": 8.127533507351415e-05, + "loss": 0.06445716381072998, + "mean_token_accuracy": 0.9832936239242553, + "num_tokens": 20598836.0, + "step": 19500 + }, + { + "entropy": 0.052891870438106704, + "epoch": 1.721797217751714, + "grad_norm": 1.4375, + "learning_rate": 8.104159131380089e-05, + "loss": 0.07390643119812011, + "mean_token_accuracy": 0.9825085845589637, + "num_tokens": 20625740.0, + "step": 19525 + }, + { + "entropy": 0.04920050672066281, + "epoch": 1.7240018518926785, + "grad_norm": 1.578125, + "learning_rate": 8.080795494805351e-05, + "loss": 0.06204095840454102, + "mean_token_accuracy": 0.9850746482610703, + "num_tokens": 20652602.0, + "step": 19550 + }, + { + "entropy": 0.053812768571224294, + "epoch": 1.7262064860336426, + "grad_norm": 1.2578125, + "learning_rate": 8.057442729975518e-05, + "loss": 0.07690982341766357, + "mean_token_accuracy": 0.9800240156054497, + "num_tokens": 20678867.0, + "step": 19575 + }, + { + "entropy": 0.04677354415842274, + "epoch": 1.728411120174607, + "grad_norm": 0.8359375, + "learning_rate": 8.034100969177337e-05, + "loss": 0.05719516754150391, + "mean_token_accuracy": 0.9842693316936493, + "num_tokens": 20705807.0, + "step": 19600 + }, + { + "entropy": 0.048505159118867595, + "epoch": 1.7306157543155714, + "grad_norm": 0.7578125, + "learning_rate": 8.010770344635199e-05, + "loss": 0.06632734775543213, + "mean_token_accuracy": 0.984979897737503, + "num_tokens": 20733283.0, + "step": 19625 + }, + { + "entropy": 0.04785260035292595, + "epoch": 1.7328203884565356, + "grad_norm": 2.890625, + "learning_rate": 7.987450988510427e-05, + "loss": 0.0693046760559082, + "mean_token_accuracy": 0.9795151484012604, + "num_tokens": 20760048.0, + "step": 19650 + }, + { + "entropy": 0.047811845683754656, + "epoch": 1.7350250225975, + "grad_norm": 2.140625, + "learning_rate": 7.964143032900513e-05, + "loss": 0.07035356521606445, + "mean_token_accuracy": 0.9821845233440399, + "num_tokens": 20784985.0, + "step": 19675 + }, + { + "entropy": 0.04927277403214248, + "epoch": 1.7372296567384642, + "grad_norm": 0.369140625, + "learning_rate": 7.94084660983836e-05, + "loss": 0.06305365085601806, + "mean_token_accuracy": 0.9862342929840088, + "num_tokens": 20811868.0, + "step": 19700 + }, + { + "entropy": 0.05829433169667027, + "epoch": 1.7394342908794287, + "grad_norm": 0.302734375, + "learning_rate": 7.917561851291538e-05, + "loss": 0.0748593282699585, + "mean_token_accuracy": 0.9797840613126755, + "num_tokens": 20839057.0, + "step": 19725 + }, + { + "entropy": 0.04679221799589868, + "epoch": 1.7416389250203927, + "grad_norm": 1.234375, + "learning_rate": 7.894288889161554e-05, + "loss": 0.06042766571044922, + "mean_token_accuracy": 0.9846546384692192, + "num_tokens": 20864043.0, + "step": 19750 + }, + { + "entropy": 0.05859222490020329, + "epoch": 1.7438435591613572, + "grad_norm": 0.703125, + "learning_rate": 7.871027855283088e-05, + "loss": 0.09572078704833985, + "mean_token_accuracy": 0.9796414789557457, + "num_tokens": 20891095.0, + "step": 19775 + }, + { + "entropy": 0.0481031046868884, + "epoch": 1.7460481933023215, + "grad_norm": 1.0859375, + "learning_rate": 7.847778881423247e-05, + "loss": 0.06286589622497558, + "mean_token_accuracy": 0.9845766887068749, + "num_tokens": 20916691.0, + "step": 19800 + }, + { + "entropy": 0.048571501801998235, + "epoch": 1.7482528274432858, + "grad_norm": 1.5859375, + "learning_rate": 7.824542099280817e-05, + "loss": 0.06077055931091309, + "mean_token_accuracy": 0.9858067938685418, + "num_tokens": 20943403.0, + "step": 19825 + }, + { + "entropy": 0.053936819200171154, + "epoch": 1.75045746158425, + "grad_norm": 3.109375, + "learning_rate": 7.801317640485528e-05, + "loss": 0.06340930938720703, + "mean_token_accuracy": 0.9828820812702179, + "num_tokens": 20969337.0, + "step": 19850 + }, + { + "entropy": 0.05901100598319317, + "epoch": 1.7526620957252144, + "grad_norm": 2.359375, + "learning_rate": 7.778105636597305e-05, + "loss": 0.08228662490844726, + "mean_token_accuracy": 0.9798002752661705, + "num_tokens": 20997335.0, + "step": 19875 + }, + { + "entropy": 0.04519621968145657, + "epoch": 1.7548667298661789, + "grad_norm": 0.296875, + "learning_rate": 7.75490621910551e-05, + "loss": 0.05941192150115967, + "mean_token_accuracy": 0.9843781432509422, + "num_tokens": 21024018.0, + "step": 19900 + }, + { + "entropy": 0.054847391085204436, + "epoch": 1.757071364007143, + "grad_norm": 0.361328125, + "learning_rate": 7.731719519428217e-05, + "loss": 0.0830865478515625, + "mean_token_accuracy": 0.9809887805581092, + "num_tokens": 21050891.0, + "step": 19925 + }, + { + "entropy": 0.06024816707678838, + "epoch": 1.7592759981481074, + "grad_norm": 1.09375, + "learning_rate": 7.708545668911443e-05, + "loss": 0.0814453887939453, + "mean_token_accuracy": 0.9796580225229263, + "num_tokens": 21079502.0, + "step": 19950 + }, + { + "entropy": 0.056985975124116524, + "epoch": 1.7614806322890715, + "grad_norm": 0.408203125, + "learning_rate": 7.685384798828432e-05, + "loss": 0.07116193771362304, + "mean_token_accuracy": 0.9817749384045601, + "num_tokens": 21105896.0, + "step": 19975 + }, + { + "entropy": 0.04627459083298163, + "epoch": 1.763685266430036, + "grad_norm": 1.1875, + "learning_rate": 7.662237040378895e-05, + "loss": 0.06480787754058838, + "mean_token_accuracy": 0.9857170182466507, + "num_tokens": 21132663.0, + "step": 20000 + }, + { + "epoch": 1.763685266430036, + "eval_entropy": 0.023058283942197815, + "eval_loss": 0.026535965502262115, + "eval_mean_token_accuracy": 0.9916715127162992, + "eval_num_tokens": 21132663.0, + "eval_runtime": 242.6864, + "eval_samples_per_second": 16.19, + "eval_steps_per_second": 4.05, + "step": 20000 + }, + { + "entropy": 0.04721232044874341, + "epoch": 1.7658899005710003, + "grad_norm": 0.99609375, + "learning_rate": 7.639102524688265e-05, + "loss": 0.05121193885803223, + "mean_token_accuracy": 0.9868710726499558, + "num_tokens": 21158041.0, + "step": 20025 + }, + { + "entropy": 0.04733763427138911, + "epoch": 1.7680945347119645, + "grad_norm": 2.453125, + "learning_rate": 7.615981382806956e-05, + "loss": 0.06285818576812745, + "mean_token_accuracy": 0.9839335259795189, + "num_tokens": 21183629.0, + "step": 20050 + }, + { + "entropy": 0.04657574629578448, + "epoch": 1.7702991688529288, + "grad_norm": 3.3125, + "learning_rate": 7.59287374570963e-05, + "loss": 0.07313352108001708, + "mean_token_accuracy": 0.984621779024601, + "num_tokens": 21209670.0, + "step": 20075 + }, + { + "entropy": 0.04935747715455364, + "epoch": 1.772503802993893, + "grad_norm": 1.203125, + "learning_rate": 7.569779744294447e-05, + "loss": 0.06564640998840332, + "mean_token_accuracy": 0.9850915068387985, + "num_tokens": 21235640.0, + "step": 20100 + }, + { + "entropy": 0.04392121256401879, + "epoch": 1.7747084371348576, + "grad_norm": 0.9453125, + "learning_rate": 7.546699509382324e-05, + "loss": 0.06469783306121826, + "mean_token_accuracy": 0.9833244362473488, + "num_tokens": 21261662.0, + "step": 20125 + }, + { + "entropy": 0.04485419812590408, + "epoch": 1.7769130712758217, + "grad_norm": 1.8125, + "learning_rate": 7.523633171716194e-05, + "loss": 0.05138749122619629, + "mean_token_accuracy": 0.985532431602478, + "num_tokens": 21288209.0, + "step": 20150 + }, + { + "entropy": 0.046874258878851834, + "epoch": 1.7791177054167862, + "grad_norm": 1.75, + "learning_rate": 7.50058086196026e-05, + "loss": 0.06949950218200683, + "mean_token_accuracy": 0.9838392865657807, + "num_tokens": 21314032.0, + "step": 20175 + }, + { + "entropy": 0.054706107557649375, + "epoch": 1.7813223395577504, + "grad_norm": 2.59375, + "learning_rate": 7.477542710699275e-05, + "loss": 0.07922077655792237, + "mean_token_accuracy": 0.9834457024931907, + "num_tokens": 21339944.0, + "step": 20200 + }, + { + "entropy": 0.05576091543363873, + "epoch": 1.7835269736987147, + "grad_norm": 1.859375, + "learning_rate": 7.454518848437782e-05, + "loss": 0.07751681327819825, + "mean_token_accuracy": 0.9836757770180702, + "num_tokens": 21366032.0, + "step": 20225 + }, + { + "entropy": 0.06782450357182825, + "epoch": 1.785731607839679, + "grad_norm": 1.3671875, + "learning_rate": 7.43150940559937e-05, + "loss": 0.09584000587463379, + "mean_token_accuracy": 0.9782721415162087, + "num_tokens": 21393287.0, + "step": 20250 + }, + { + "entropy": 0.050107013575034214, + "epoch": 1.7879362419806433, + "grad_norm": 2.734375, + "learning_rate": 7.408514512525961e-05, + "loss": 0.06766324996948242, + "mean_token_accuracy": 0.9848771205544472, + "num_tokens": 21419444.0, + "step": 20275 + }, + { + "entropy": 0.04845013963735255, + "epoch": 1.7901408761216078, + "grad_norm": 1.1953125, + "learning_rate": 7.385534299477049e-05, + "loss": 0.0615593433380127, + "mean_token_accuracy": 0.9860979354381562, + "num_tokens": 21444863.0, + "step": 20300 + }, + { + "entropy": 0.04458080105272529, + "epoch": 1.7923455102625718, + "grad_norm": 1.4296875, + "learning_rate": 7.362568896628977e-05, + "loss": 0.05469425678253174, + "mean_token_accuracy": 0.9860287711024285, + "num_tokens": 21471190.0, + "step": 20325 + }, + { + "entropy": 0.048252884604953576, + "epoch": 1.7945501444035363, + "grad_norm": 1.828125, + "learning_rate": 7.339618434074182e-05, + "loss": 0.07198523044586182, + "mean_token_accuracy": 0.9825797024369239, + "num_tokens": 21497177.0, + "step": 20350 + }, + { + "entropy": 0.0426078699176287, + "epoch": 1.7967547785445004, + "grad_norm": 0.9453125, + "learning_rate": 7.316683041820474e-05, + "loss": 0.07130131244659424, + "mean_token_accuracy": 0.9853621581196785, + "num_tokens": 21521844.0, + "step": 20375 + }, + { + "entropy": 0.04121265197856701, + "epoch": 1.7989594126854649, + "grad_norm": 0.419921875, + "learning_rate": 7.293762849790294e-05, + "loss": 0.055575294494628905, + "mean_token_accuracy": 0.9851228359341622, + "num_tokens": 21546943.0, + "step": 20400 + }, + { + "entropy": 0.04701503006057464, + "epoch": 1.8011640468264292, + "grad_norm": 0.21875, + "learning_rate": 7.270857987819984e-05, + "loss": 0.061670899391174316, + "mean_token_accuracy": 0.9868606904149055, + "num_tokens": 21572935.0, + "step": 20425 + }, + { + "entropy": 0.04451014851059881, + "epoch": 1.8033686809673934, + "grad_norm": 3.21875, + "learning_rate": 7.247968585659032e-05, + "loss": 0.060894203186035153, + "mean_token_accuracy": 0.9842220428586006, + "num_tokens": 21599217.0, + "step": 20450 + }, + { + "entropy": 0.059572365196509054, + "epoch": 1.8055733151083577, + "grad_norm": 2.34375, + "learning_rate": 7.225094772969361e-05, + "loss": 0.07416722774505616, + "mean_token_accuracy": 0.9822189456224442, + "num_tokens": 21626925.0, + "step": 20475 + }, + { + "entropy": 0.046611685622046935, + "epoch": 1.807777949249322, + "grad_norm": 0.828125, + "learning_rate": 7.202236679324581e-05, + "loss": 0.06417181015014649, + "mean_token_accuracy": 0.9834126874804496, + "num_tokens": 21653311.0, + "step": 20500 + }, + { + "entropy": 0.04335411591440788, + "epoch": 1.8099825833902865, + "grad_norm": 0.8359375, + "learning_rate": 7.179394434209264e-05, + "loss": 0.058134937286376955, + "mean_token_accuracy": 0.9870638877153397, + "num_tokens": 21677976.0, + "step": 20525 + }, + { + "entropy": 0.05337048692628741, + "epoch": 1.8121872175312506, + "grad_norm": 1.1796875, + "learning_rate": 7.156568167018187e-05, + "loss": 0.06563668727874755, + "mean_token_accuracy": 0.9823853915929794, + "num_tokens": 21705733.0, + "step": 20550 + }, + { + "entropy": 0.03864829636509967, + "epoch": 1.814391851672215, + "grad_norm": 0.0947265625, + "learning_rate": 7.133758007055639e-05, + "loss": 0.05145081520080566, + "mean_token_accuracy": 0.9869194403290749, + "num_tokens": 21731242.0, + "step": 20575 + }, + { + "entropy": 0.0520756857453307, + "epoch": 1.8165964858131793, + "grad_norm": 9.375, + "learning_rate": 7.110964083534651e-05, + "loss": 0.07184930324554444, + "mean_token_accuracy": 0.9811625573039054, + "num_tokens": 21757526.0, + "step": 20600 + }, + { + "entropy": 0.0557931135634135, + "epoch": 1.8188011199541436, + "grad_norm": 0.74609375, + "learning_rate": 7.088186525576289e-05, + "loss": 0.07704683303833008, + "mean_token_accuracy": 0.9804368880391121, + "num_tokens": 21783946.0, + "step": 20625 + }, + { + "entropy": 0.04322621882762178, + "epoch": 1.821005754095108, + "grad_norm": 0.1494140625, + "learning_rate": 7.0654254622089e-05, + "loss": 0.05452206611633301, + "mean_token_accuracy": 0.9864040830731392, + "num_tokens": 21809770.0, + "step": 20650 + }, + { + "entropy": 0.04771335199315217, + "epoch": 1.8232103882360722, + "grad_norm": 0.59375, + "learning_rate": 7.042681022367406e-05, + "loss": 0.06347963333129883, + "mean_token_accuracy": 0.9841774880886078, + "num_tokens": 21835665.0, + "step": 20675 + }, + { + "entropy": 0.053830798321432664, + "epoch": 1.8254150223770367, + "grad_norm": 1.8515625, + "learning_rate": 7.019953334892557e-05, + "loss": 0.07136422634124756, + "mean_token_accuracy": 0.9813834890723229, + "num_tokens": 21862527.0, + "step": 20700 + }, + { + "entropy": 0.04650898744526785, + "epoch": 1.8276196565180007, + "grad_norm": 1.7109375, + "learning_rate": 6.99724252853021e-05, + "loss": 0.05398883819580078, + "mean_token_accuracy": 0.9853073519468307, + "num_tokens": 21889036.0, + "step": 20725 + }, + { + "entropy": 0.05225651402957737, + "epoch": 1.8298242906589652, + "grad_norm": 1.1328125, + "learning_rate": 6.974548731930582e-05, + "loss": 0.06863418102264404, + "mean_token_accuracy": 0.9822486186027527, + "num_tokens": 21916165.0, + "step": 20750 + }, + { + "entropy": 0.055327791996278394, + "epoch": 1.8320289247999293, + "grad_norm": 1.8203125, + "learning_rate": 6.951872073647546e-05, + "loss": 0.06615938663482666, + "mean_token_accuracy": 0.9840937641263008, + "num_tokens": 21944120.0, + "step": 20775 + }, + { + "entropy": 0.04494785757124191, + "epoch": 1.8342335589408938, + "grad_norm": 2.265625, + "learning_rate": 6.929212682137896e-05, + "loss": 0.05958599090576172, + "mean_token_accuracy": 0.9841647908091545, + "num_tokens": 21970158.0, + "step": 20800 + }, + { + "entropy": 0.04708563433858217, + "epoch": 1.836438193081858, + "grad_norm": 1.375, + "learning_rate": 6.906570685760602e-05, + "loss": 0.05993311405181885, + "mean_token_accuracy": 0.9865586760640145, + "num_tokens": 21995764.0, + "step": 20825 + }, + { + "entropy": 0.05176374662743911, + "epoch": 1.8386428272228224, + "grad_norm": 0.58203125, + "learning_rate": 6.8839462127761e-05, + "loss": 0.07699875831604004, + "mean_token_accuracy": 0.9810626646876335, + "num_tokens": 22022035.0, + "step": 20850 + }, + { + "entropy": 0.045616496206566805, + "epoch": 1.8408474613637866, + "grad_norm": 2.5, + "learning_rate": 6.861339391345563e-05, + "loss": 0.06166870594024658, + "mean_token_accuracy": 0.9853792524337769, + "num_tokens": 22048310.0, + "step": 20875 + }, + { + "entropy": 0.05039491361167166, + "epoch": 1.843052095504751, + "grad_norm": 1.6875, + "learning_rate": 6.838750349530175e-05, + "loss": 0.06637926578521729, + "mean_token_accuracy": 0.9829424172639847, + "num_tokens": 22074551.0, + "step": 20900 + }, + { + "entropy": 0.04769486486184178, + "epoch": 1.8452567296457154, + "grad_norm": 1.0234375, + "learning_rate": 6.8161792152904e-05, + "loss": 0.05665205478668213, + "mean_token_accuracy": 0.9840876114368439, + "num_tokens": 22100230.0, + "step": 20925 + }, + { + "entropy": 0.0528660134456004, + "epoch": 1.8474613637866795, + "grad_norm": 1.546875, + "learning_rate": 6.793626116485261e-05, + "loss": 0.07378475189208984, + "mean_token_accuracy": 0.9837013658881187, + "num_tokens": 22125633.0, + "step": 20950 + }, + { + "entropy": 0.05890303898835555, + "epoch": 1.849665997927644, + "grad_norm": 1.375, + "learning_rate": 6.771091180871611e-05, + "loss": 0.0943376350402832, + "mean_token_accuracy": 0.9790048637986183, + "num_tokens": 22152592.0, + "step": 20975 + }, + { + "entropy": 0.04344821692240657, + "epoch": 1.8518706320686082, + "grad_norm": 3.375, + "learning_rate": 6.748574536103424e-05, + "loss": 0.05888943672180176, + "mean_token_accuracy": 0.98219693005085, + "num_tokens": 22178877.0, + "step": 21000 + }, + { + "epoch": 1.8518706320686082, + "eval_entropy": 0.022571654543839585, + "eval_loss": 0.025132818147540092, + "eval_mean_token_accuracy": 0.992005098508794, + "eval_num_tokens": 22178877.0, + "eval_runtime": 243.2087, + "eval_samples_per_second": 16.155, + "eval_steps_per_second": 4.042, + "step": 21000 + }, + { + "entropy": 0.0381835427560145, + "epoch": 1.8540752662095725, + "grad_norm": 0.376953125, + "learning_rate": 6.726076309731056e-05, + "loss": 0.040510034561157225, + "mean_token_accuracy": 0.9873276236653328, + "num_tokens": 22202989.0, + "step": 21025 + }, + { + "entropy": 0.05477136584289838, + "epoch": 1.8562799003505368, + "grad_norm": 2.234375, + "learning_rate": 6.70359662920053e-05, + "loss": 0.06961938858032227, + "mean_token_accuracy": 0.9818892487883568, + "num_tokens": 22229607.0, + "step": 21050 + }, + { + "entropy": 0.060442834978384784, + "epoch": 1.858484534491501, + "grad_norm": 2.203125, + "learning_rate": 6.681135621852803e-05, + "loss": 0.08862739562988281, + "mean_token_accuracy": 0.9802686884999275, + "num_tokens": 22257392.0, + "step": 21075 + }, + { + "entropy": 0.04568748531426536, + "epoch": 1.8606891686324656, + "grad_norm": 2.515625, + "learning_rate": 6.658693414923064e-05, + "loss": 0.05948817729949951, + "mean_token_accuracy": 0.986970128417015, + "num_tokens": 22283172.0, + "step": 21100 + }, + { + "entropy": 0.05226036834908882, + "epoch": 1.8628938027734296, + "grad_norm": 1.625, + "learning_rate": 6.636270135540004e-05, + "loss": 0.06003546714782715, + "mean_token_accuracy": 0.984726087152958, + "num_tokens": 22310102.0, + "step": 21125 + }, + { + "entropy": 0.045168612728011794, + "epoch": 1.8650984369143941, + "grad_norm": 0.828125, + "learning_rate": 6.613865910725088e-05, + "loss": 0.05067370414733887, + "mean_token_accuracy": 0.9873959225416183, + "num_tokens": 22334866.0, + "step": 21150 + }, + { + "entropy": 0.05355773056653561, + "epoch": 1.8673030710553582, + "grad_norm": 1.6875, + "learning_rate": 6.591480867391846e-05, + "loss": 0.07002598285675049, + "mean_token_accuracy": 0.9833967351913452, + "num_tokens": 22361930.0, + "step": 21175 + }, + { + "entropy": 0.04413744929035602, + "epoch": 1.8695077051963227, + "grad_norm": 1.40625, + "learning_rate": 6.569115132345147e-05, + "loss": 0.0611871862411499, + "mean_token_accuracy": 0.9877374297380448, + "num_tokens": 22388635.0, + "step": 21200 + }, + { + "entropy": 0.043995544389399587, + "epoch": 1.871712339337287, + "grad_norm": 2.40625, + "learning_rate": 6.546768832280488e-05, + "loss": 0.06620774745941162, + "mean_token_accuracy": 0.9850053295493126, + "num_tokens": 22415102.0, + "step": 21225 + }, + { + "entropy": 0.056586300741619196, + "epoch": 1.8739169734782513, + "grad_norm": 1.609375, + "learning_rate": 6.524442093783278e-05, + "loss": 0.08194119453430176, + "mean_token_accuracy": 0.9824917709827423, + "num_tokens": 22441198.0, + "step": 21250 + }, + { + "entropy": 0.05184478218172444, + "epoch": 1.8761216076192155, + "grad_norm": 0.83203125, + "learning_rate": 6.502135043328099e-05, + "loss": 0.0703523302078247, + "mean_token_accuracy": 0.9819017976522446, + "num_tokens": 22467371.0, + "step": 21275 + }, + { + "entropy": 0.05208466888478142, + "epoch": 1.8783262417601798, + "grad_norm": 0.515625, + "learning_rate": 6.479847807278016e-05, + "loss": 0.07266618251800537, + "mean_token_accuracy": 0.9821174338459968, + "num_tokens": 22493871.0, + "step": 21300 + }, + { + "entropy": 0.046822893920980276, + "epoch": 1.8805308759011443, + "grad_norm": 1.6015625, + "learning_rate": 6.457580511883851e-05, + "loss": 0.06494226932525635, + "mean_token_accuracy": 0.981035427749157, + "num_tokens": 22520520.0, + "step": 21325 + }, + { + "entropy": 0.04288673719856888, + "epoch": 1.8827355100421084, + "grad_norm": 0.859375, + "learning_rate": 6.435333283283475e-05, + "loss": 0.05355540275573731, + "mean_token_accuracy": 0.9868204814195632, + "num_tokens": 22547233.0, + "step": 21350 + }, + { + "entropy": 0.04752451835520333, + "epoch": 1.8849401441830729, + "grad_norm": 1.015625, + "learning_rate": 6.413106247501069e-05, + "loss": 0.06815120220184326, + "mean_token_accuracy": 0.9843712577223778, + "num_tokens": 22572916.0, + "step": 21375 + }, + { + "entropy": 0.04865189905962325, + "epoch": 1.8871447783240372, + "grad_norm": 1.3125, + "learning_rate": 6.390899530446443e-05, + "loss": 0.07241939544677735, + "mean_token_accuracy": 0.9836265042424202, + "num_tokens": 22598245.0, + "step": 21400 + }, + { + "entropy": 0.05860245328833116, + "epoch": 1.8893494124650014, + "grad_norm": 2.375, + "learning_rate": 6.368713257914295e-05, + "loss": 0.07383595943450928, + "mean_token_accuracy": 0.9842527809739113, + "num_tokens": 22624338.0, + "step": 21425 + }, + { + "entropy": 0.05662819688994205, + "epoch": 1.8915540466059657, + "grad_norm": 2.0625, + "learning_rate": 6.346547555583526e-05, + "loss": 0.07304172039031982, + "mean_token_accuracy": 0.98199245095253, + "num_tokens": 22650941.0, + "step": 21450 + }, + { + "entropy": 0.04486946415730927, + "epoch": 1.89375868074693, + "grad_norm": 0.62109375, + "learning_rate": 6.324402549016493e-05, + "loss": 0.05117866992950439, + "mean_token_accuracy": 0.9867324560880661, + "num_tokens": 22676550.0, + "step": 21475 + }, + { + "entropy": 0.058360014182981104, + "epoch": 1.8959633148878945, + "grad_norm": 0.88671875, + "learning_rate": 6.302278363658337e-05, + "loss": 0.08039702415466309, + "mean_token_accuracy": 0.9807369062304496, + "num_tokens": 22704290.0, + "step": 21500 + }, + { + "entropy": 0.04482670632161898, + "epoch": 1.8981679490288585, + "grad_norm": 1.53125, + "learning_rate": 6.280175124836234e-05, + "loss": 0.047524452209472656, + "mean_token_accuracy": 0.9890332189202309, + "num_tokens": 22729939.0, + "step": 21525 + }, + { + "entropy": 0.04213360240697511, + "epoch": 1.900372583169823, + "grad_norm": 0.7265625, + "learning_rate": 6.258092957758727e-05, + "loss": 0.04635190963745117, + "mean_token_accuracy": 0.987784284055233, + "num_tokens": 22755060.0, + "step": 21550 + }, + { + "entropy": 0.04406625685893232, + "epoch": 1.9025772173107873, + "grad_norm": 1.65625, + "learning_rate": 6.236031987514968e-05, + "loss": 0.06420273780822754, + "mean_token_accuracy": 0.979720167517662, + "num_tokens": 22780955.0, + "step": 21575 + }, + { + "entropy": 0.048322958536518856, + "epoch": 1.9047818514517516, + "grad_norm": 1.28125, + "learning_rate": 6.213992339074052e-05, + "loss": 0.07283812046051025, + "mean_token_accuracy": 0.9831582528352737, + "num_tokens": 22807721.0, + "step": 21600 + }, + { + "entropy": 0.04930130613342044, + "epoch": 1.9069864855927159, + "grad_norm": 0.3125, + "learning_rate": 6.191974137284286e-05, + "loss": 0.0650008487701416, + "mean_token_accuracy": 0.987842877805233, + "num_tokens": 22833581.0, + "step": 21625 + }, + { + "entropy": 0.056969212190742836, + "epoch": 1.9091911197336802, + "grad_norm": 1.421875, + "learning_rate": 6.169977506872495e-05, + "loss": 0.07546248912811279, + "mean_token_accuracy": 0.9796876290440559, + "num_tokens": 22860516.0, + "step": 21650 + }, + { + "entropy": 0.049281568287333356, + "epoch": 1.9113957538746447, + "grad_norm": 1.1875, + "learning_rate": 6.148002572443293e-05, + "loss": 0.060806169509887695, + "mean_token_accuracy": 0.9853131911158561, + "num_tokens": 22887480.0, + "step": 21675 + }, + { + "entropy": 0.04275947823371098, + "epoch": 1.9136003880156087, + "grad_norm": 0.90234375, + "learning_rate": 6.126049458478406e-05, + "loss": 0.06027640342712402, + "mean_token_accuracy": 0.9874812626838684, + "num_tokens": 22913669.0, + "step": 21700 + }, + { + "entropy": 0.04580010384270281, + "epoch": 1.9158050221565732, + "grad_norm": 2.921875, + "learning_rate": 6.104118289335954e-05, + "loss": 0.07557197093963623, + "mean_token_accuracy": 0.9834654727578163, + "num_tokens": 22939277.0, + "step": 21725 + }, + { + "entropy": 0.053979095179820434, + "epoch": 1.9180096562975373, + "grad_norm": 2.546875, + "learning_rate": 6.082209189249737e-05, + "loss": 0.08138853073120117, + "mean_token_accuracy": 0.9819204857945443, + "num_tokens": 22966377.0, + "step": 21750 + }, + { + "entropy": 0.05803485995536903, + "epoch": 1.9202142904385018, + "grad_norm": 1.015625, + "learning_rate": 6.060322282328541e-05, + "loss": 0.07845424175262451, + "mean_token_accuracy": 0.9816239723563194, + "num_tokens": 22993866.0, + "step": 21775 + }, + { + "entropy": 0.047958312368718906, + "epoch": 1.922418924579466, + "grad_norm": 2.453125, + "learning_rate": 6.038457692555439e-05, + "loss": 0.059175658226013186, + "mean_token_accuracy": 0.9811293777823448, + "num_tokens": 23019960.0, + "step": 21800 + }, + { + "entropy": 0.05747593588210293, + "epoch": 1.9246235587204303, + "grad_norm": 2.125, + "learning_rate": 6.0166155437870874e-05, + "loss": 0.07036843299865722, + "mean_token_accuracy": 0.9825942468643188, + "num_tokens": 23047052.0, + "step": 21825 + }, + { + "entropy": 0.04418198180996114, + "epoch": 1.9268281928613946, + "grad_norm": 0.5078125, + "learning_rate": 5.994795959753011e-05, + "loss": 0.0574480676651001, + "mean_token_accuracy": 0.9849589946866035, + "num_tokens": 23071692.0, + "step": 21850 + }, + { + "entropy": 0.04961350689853134, + "epoch": 1.929032827002359, + "grad_norm": 1.203125, + "learning_rate": 5.9729990640549135e-05, + "loss": 0.06114984512329102, + "mean_token_accuracy": 0.9833905136585236, + "num_tokens": 23097756.0, + "step": 21875 + }, + { + "entropy": 0.046675025945150994, + "epoch": 1.9312374611433234, + "grad_norm": 0.375, + "learning_rate": 5.95122498016598e-05, + "loss": 0.06442777156829833, + "mean_token_accuracy": 0.9827769809961319, + "num_tokens": 23124268.0, + "step": 21900 + }, + { + "entropy": 0.05056103619572241, + "epoch": 1.9334420952842875, + "grad_norm": 0.8125, + "learning_rate": 5.9294738314301743e-05, + "loss": 0.06868845462799072, + "mean_token_accuracy": 0.9845385247468948, + "num_tokens": 23151215.0, + "step": 21925 + }, + { + "entropy": 0.048866855730157116, + "epoch": 1.935646729425252, + "grad_norm": 1.6171875, + "learning_rate": 5.907745741061534e-05, + "loss": 0.057186398506164554, + "mean_token_accuracy": 0.9831150618195533, + "num_tokens": 23177437.0, + "step": 21950 + }, + { + "entropy": 0.04560523602282046, + "epoch": 1.9378513635662162, + "grad_norm": 0.5703125, + "learning_rate": 5.8860408321434846e-05, + "loss": 0.061235876083374025, + "mean_token_accuracy": 0.9851277622580529, + "num_tokens": 23202778.0, + "step": 21975 + }, + { + "entropy": 0.04789327524813416, + "epoch": 1.9400559977071805, + "grad_norm": 1.984375, + "learning_rate": 5.864359227628122e-05, + "loss": 0.07066461086273193, + "mean_token_accuracy": 0.9854995140433311, + "num_tokens": 23228354.0, + "step": 22000 + }, + { + "epoch": 1.9400559977071805, + "eval_entropy": 0.020465548207560823, + "eval_loss": 0.024450432509183884, + "eval_mean_token_accuracy": 0.9920910437376308, + "eval_num_tokens": 23228354.0, + "eval_runtime": 257.3468, + "eval_samples_per_second": 15.267, + "eval_steps_per_second": 3.82, + "step": 22000 + }, + { + "entropy": 0.03955814203254704, + "epoch": 1.9422606318481448, + "grad_norm": 0.9453125, + "learning_rate": 5.842701050335543e-05, + "loss": 0.060781092643737794, + "mean_token_accuracy": 0.9853532475233078, + "num_tokens": 23254352.0, + "step": 22025 + }, + { + "entropy": 0.0578464501263079, + "epoch": 1.944465265989109, + "grad_norm": 0.380859375, + "learning_rate": 5.8210664229531295e-05, + "loss": 0.08736164093017579, + "mean_token_accuracy": 0.981940735578537, + "num_tokens": 23281917.0, + "step": 22050 + }, + { + "entropy": 0.05473690754020936, + "epoch": 1.9466699001300736, + "grad_norm": 1.4296875, + "learning_rate": 5.799455468034867e-05, + "loss": 0.08340877532958985, + "mean_token_accuracy": 0.9806632310152054, + "num_tokens": 23308935.0, + "step": 22075 + }, + { + "entropy": 0.06383058197628998, + "epoch": 1.9488745342710376, + "grad_norm": 0.306640625, + "learning_rate": 5.777868308000629e-05, + "loss": 0.0989750099182129, + "mean_token_accuracy": 0.9789142432808876, + "num_tokens": 23336067.0, + "step": 22100 + }, + { + "entropy": 0.04984018235598341, + "epoch": 1.9510791684120021, + "grad_norm": 1.0546875, + "learning_rate": 5.7563050651355144e-05, + "loss": 0.06965320110321045, + "mean_token_accuracy": 0.9844454318284989, + "num_tokens": 23362163.0, + "step": 22125 + }, + { + "entropy": 0.05287686903800932, + "epoch": 1.9532838025529662, + "grad_norm": 1.1328125, + "learning_rate": 5.73476586158912e-05, + "loss": 0.05809790134429932, + "mean_token_accuracy": 0.9823646634817124, + "num_tokens": 23387801.0, + "step": 22150 + }, + { + "entropy": 0.04915350146053243, + "epoch": 1.9554884366939307, + "grad_norm": 1.03125, + "learning_rate": 5.713250819374888e-05, + "loss": 0.06481000423431396, + "mean_token_accuracy": 0.9831967288255692, + "num_tokens": 23413992.0, + "step": 22175 + }, + { + "entropy": 0.04388614729527035, + "epoch": 1.957693070834895, + "grad_norm": 0.79296875, + "learning_rate": 5.691760060369372e-05, + "loss": 0.057064995765686036, + "mean_token_accuracy": 0.9855844187736511, + "num_tokens": 23440892.0, + "step": 22200 + }, + { + "entropy": 0.04378307052866148, + "epoch": 1.9598977049758592, + "grad_norm": 2.71875, + "learning_rate": 5.6702937063115844e-05, + "loss": 0.06034691333770752, + "mean_token_accuracy": 0.9812196576595307, + "num_tokens": 23468946.0, + "step": 22225 + }, + { + "entropy": 0.042295570675632915, + "epoch": 1.9621023391168235, + "grad_norm": 0.828125, + "learning_rate": 5.6488518788022834e-05, + "loss": 0.056632590293884275, + "mean_token_accuracy": 0.9866640722751617, + "num_tokens": 23494432.0, + "step": 22250 + }, + { + "entropy": 0.05475355692382436, + "epoch": 1.9643069732577878, + "grad_norm": 2.515625, + "learning_rate": 5.627434699303296e-05, + "loss": 0.08935759544372558, + "mean_token_accuracy": 0.9836769181489945, + "num_tokens": 23521590.0, + "step": 22275 + }, + { + "entropy": 0.04196426347873057, + "epoch": 1.9665116073987523, + "grad_norm": 0.251953125, + "learning_rate": 5.606042289136816e-05, + "loss": 0.05499490737915039, + "mean_token_accuracy": 0.9883558192849159, + "num_tokens": 23547518.0, + "step": 22300 + }, + { + "entropy": 0.049566179718676724, + "epoch": 1.9687162415397164, + "grad_norm": 1.4296875, + "learning_rate": 5.584674769484735e-05, + "loss": 0.06642638683319092, + "mean_token_accuracy": 0.9826064822077751, + "num_tokens": 23573365.0, + "step": 22325 + }, + { + "entropy": 0.046303658723190894, + "epoch": 1.9709208756806809, + "grad_norm": 1.765625, + "learning_rate": 5.563332261387946e-05, + "loss": 0.0545860767364502, + "mean_token_accuracy": 0.9867219117283821, + "num_tokens": 23600079.0, + "step": 22350 + }, + { + "entropy": 0.05276435214473167, + "epoch": 1.9731255098216451, + "grad_norm": 2.15625, + "learning_rate": 5.542014885745654e-05, + "loss": 0.07084882259368896, + "mean_token_accuracy": 0.9833718439936638, + "num_tokens": 23627130.0, + "step": 22375 + }, + { + "entropy": 0.0445670667073864, + "epoch": 1.9753301439626094, + "grad_norm": 1.640625, + "learning_rate": 5.520722763314694e-05, + "loss": 0.06202894687652588, + "mean_token_accuracy": 0.9840847709774971, + "num_tokens": 23653384.0, + "step": 22400 + }, + { + "entropy": 0.0423390647029737, + "epoch": 1.9775347781035737, + "grad_norm": 0.380859375, + "learning_rate": 5.4994560147088594e-05, + "loss": 0.06795649528503418, + "mean_token_accuracy": 0.9833470878005027, + "num_tokens": 23679568.0, + "step": 22425 + }, + { + "entropy": 0.05557289769756608, + "epoch": 1.979739412244538, + "grad_norm": 1.6171875, + "learning_rate": 5.4782147603981993e-05, + "loss": 0.07008291244506835, + "mean_token_accuracy": 0.9816686421632767, + "num_tokens": 23706863.0, + "step": 22450 + }, + { + "entropy": 0.058572328451264186, + "epoch": 1.9819440463855025, + "grad_norm": 0.58203125, + "learning_rate": 5.4569991207083506e-05, + "loss": 0.0892257308959961, + "mean_token_accuracy": 0.981511018872261, + "num_tokens": 23734937.0, + "step": 22475 + }, + { + "entropy": 0.05591572977726173, + "epoch": 1.9841486805264665, + "grad_norm": 1.25, + "learning_rate": 5.435809215819843e-05, + "loss": 0.08323918342590332, + "mean_token_accuracy": 0.981179955303669, + "num_tokens": 23760727.0, + "step": 22500 + }, + { + "entropy": 0.048877669227076696, + "epoch": 1.986353314667431, + "grad_norm": 1.90625, + "learning_rate": 5.4146451657674314e-05, + "loss": 0.05748679637908936, + "mean_token_accuracy": 0.986055271923542, + "num_tokens": 23788078.0, + "step": 22525 + }, + { + "entropy": 0.03823746287758695, + "epoch": 1.988557948808395, + "grad_norm": 2.21875, + "learning_rate": 5.393507090439409e-05, + "loss": 0.05168859958648682, + "mean_token_accuracy": 0.9883771386742591, + "num_tokens": 23813096.0, + "step": 22550 + }, + { + "entropy": 0.053251380213696395, + "epoch": 1.9907625829493596, + "grad_norm": 1.59375, + "learning_rate": 5.3723951095769376e-05, + "loss": 0.0701138734817505, + "mean_token_accuracy": 0.9776954674720764, + "num_tokens": 23840476.0, + "step": 22575 + }, + { + "entropy": 0.04151842765553738, + "epoch": 1.9929672170903239, + "grad_norm": 2.453125, + "learning_rate": 5.351309342773347e-05, + "loss": 0.05058634757995605, + "mean_token_accuracy": 0.9842834863066673, + "num_tokens": 23866669.0, + "step": 22600 + }, + { + "entropy": 0.04889295605695224, + "epoch": 1.9951718512312882, + "grad_norm": 2.53125, + "learning_rate": 5.330249909473477e-05, + "loss": 0.062028875350952146, + "mean_token_accuracy": 0.9804515436291694, + "num_tokens": 23893348.0, + "step": 22625 + }, + { + "entropy": 0.04262204033504531, + "epoch": 1.9973764853722524, + "grad_norm": 0.80078125, + "learning_rate": 5.309216928973e-05, + "loss": 0.05334056854248047, + "mean_token_accuracy": 0.9852747458219528, + "num_tokens": 23920789.0, + "step": 22650 + }, + { + "entropy": 0.06502369482783252, + "epoch": 1.9995811195132167, + "grad_norm": 2.5625, + "learning_rate": 5.28821052041774e-05, + "loss": 0.10824305534362794, + "mean_token_accuracy": 0.9763033568859101, + "num_tokens": 23947393.0, + "step": 22675 + }, + { + "entropy": 0.04334155930002747, + "epoch": 2.0017637073127714, + "grad_norm": 1.4296875, + "learning_rate": 5.2672308028029915e-05, + "loss": 0.03947641611099243, + "mean_token_accuracy": 0.9899958405229781, + "num_tokens": 23973397.0, + "step": 22700 + }, + { + "entropy": 0.036623261850108974, + "epoch": 2.003968341453736, + "grad_norm": 0.82421875, + "learning_rate": 5.246277894972862e-05, + "loss": 0.0373115611076355, + "mean_token_accuracy": 0.9904880458116532, + "num_tokens": 23999702.0, + "step": 22725 + }, + { + "entropy": 0.03956556206168898, + "epoch": 2.0061729755947, + "grad_norm": 0.80859375, + "learning_rate": 5.225351915619583e-05, + "loss": 0.030584862232208253, + "mean_token_accuracy": 0.9897949555516243, + "num_tokens": 24025597.0, + "step": 22750 + }, + { + "entropy": 0.040053981658929844, + "epoch": 2.0083776097356645, + "grad_norm": 1.1953125, + "learning_rate": 5.20445298328285e-05, + "loss": 0.04506603240966797, + "mean_token_accuracy": 0.9884122493863106, + "num_tokens": 24051938.0, + "step": 22775 + }, + { + "entropy": 0.03691858461916127, + "epoch": 2.0105822438766285, + "grad_norm": 0.71484375, + "learning_rate": 5.183581216349134e-05, + "loss": 0.030272905826568604, + "mean_token_accuracy": 0.9916832426190376, + "num_tokens": 24077647.0, + "step": 22800 + }, + { + "entropy": 0.04003718716594449, + "epoch": 2.012786878017593, + "grad_norm": 0.5546875, + "learning_rate": 5.162736733051035e-05, + "loss": 0.039083366394042966, + "mean_token_accuracy": 0.9893439787626267, + "num_tokens": 24104616.0, + "step": 22825 + }, + { + "entropy": 0.03008637696853839, + "epoch": 2.014991512158557, + "grad_norm": 3.09375, + "learning_rate": 5.1419196514665845e-05, + "loss": 0.028565278053283693, + "mean_token_accuracy": 0.9929461520910263, + "num_tokens": 24131459.0, + "step": 22850 + }, + { + "entropy": 0.037699917218196786, + "epoch": 2.0171961462995216, + "grad_norm": 0.984375, + "learning_rate": 5.121130089518609e-05, + "loss": 0.03901580572128296, + "mean_token_accuracy": 0.990917377769947, + "num_tokens": 24159204.0, + "step": 22875 + }, + { + "entropy": 0.032397389931247744, + "epoch": 2.019400780440486, + "grad_norm": 0.2236328125, + "learning_rate": 5.100368164974023e-05, + "loss": 0.03438486814498901, + "mean_token_accuracy": 0.991854096353054, + "num_tokens": 24186110.0, + "step": 22900 + }, + { + "entropy": 0.03398451173638023, + "epoch": 2.02160541458145, + "grad_norm": 0.408203125, + "learning_rate": 5.0796339954432e-05, + "loss": 0.033078410625457765, + "mean_token_accuracy": 0.9912483549118042, + "num_tokens": 24212478.0, + "step": 22925 + }, + { + "entropy": 0.03318491613783408, + "epoch": 2.0238100487224147, + "grad_norm": 1.2265625, + "learning_rate": 5.0589276983792835e-05, + "loss": 0.029813830852508546, + "mean_token_accuracy": 0.9912602853775024, + "num_tokens": 24239204.0, + "step": 22950 + }, + { + "entropy": 0.03395519398247416, + "epoch": 2.0260146828633787, + "grad_norm": 0.26171875, + "learning_rate": 5.0382493910775275e-05, + "loss": 0.04194780826568603, + "mean_token_accuracy": 0.9895057672262192, + "num_tokens": 24265682.0, + "step": 22975 + }, + { + "entropy": 0.032896046323221524, + "epoch": 2.028219317004343, + "grad_norm": 2.375, + "learning_rate": 5.0175991906746335e-05, + "loss": 0.04431535720825195, + "mean_token_accuracy": 0.9895771262049675, + "num_tokens": 24292892.0, + "step": 23000 + }, + { + "epoch": 2.028219317004343, + "eval_entropy": 0.01695111796384467, + "eval_loss": 0.024288112297654152, + "eval_mean_token_accuracy": 0.9923011344665186, + "eval_num_tokens": 24292892.0, + "eval_runtime": 267.7927, + "eval_samples_per_second": 14.672, + "eval_steps_per_second": 3.671, + "step": 23000 + }, + { + "entropy": 0.030041932204985643, + "epoch": 2.0304239511453073, + "grad_norm": 0.39453125, + "learning_rate": 4.996977214148083e-05, + "loss": 0.04025341033935547, + "mean_token_accuracy": 0.9895694407820702, + "num_tokens": 24319322.0, + "step": 23025 + }, + { + "entropy": 0.03509666084595665, + "epoch": 2.0326285852862718, + "grad_norm": 3.109375, + "learning_rate": 4.976383578315473e-05, + "loss": 0.04548058986663819, + "mean_token_accuracy": 0.9889927119016647, + "num_tokens": 24346498.0, + "step": 23050 + }, + { + "entropy": 0.026039416307394276, + "epoch": 2.0348332194272363, + "grad_norm": 1.0546875, + "learning_rate": 4.955818399833868e-05, + "loss": 0.026256818771362305, + "mean_token_accuracy": 0.992972640991211, + "num_tokens": 24370667.0, + "step": 23075 + }, + { + "entropy": 0.028871810426353478, + "epoch": 2.0370378535682003, + "grad_norm": 0.3984375, + "learning_rate": 4.935281795199128e-05, + "loss": 0.034981093406677245, + "mean_token_accuracy": 0.9913325354456901, + "num_tokens": 24396592.0, + "step": 23100 + }, + { + "entropy": 0.028396220860795438, + "epoch": 2.039242487709165, + "grad_norm": 0.306640625, + "learning_rate": 4.914773880745241e-05, + "loss": 0.03216857671737671, + "mean_token_accuracy": 0.9912496462464333, + "num_tokens": 24423211.0, + "step": 23125 + }, + { + "entropy": 0.028173443265914103, + "epoch": 2.041447121850129, + "grad_norm": 1.21875, + "learning_rate": 4.894294772643684e-05, + "loss": 0.032784721851348876, + "mean_token_accuracy": 0.9923763892054558, + "num_tokens": 24449053.0, + "step": 23150 + }, + { + "entropy": 0.0323675285151694, + "epoch": 2.0436517559910934, + "grad_norm": 0.73828125, + "learning_rate": 4.87384458690275e-05, + "loss": 0.03548475980758667, + "mean_token_accuracy": 0.9910583171248436, + "num_tokens": 24476450.0, + "step": 23175 + }, + { + "entropy": 0.029555844330679974, + "epoch": 2.0458563901320574, + "grad_norm": 0.9609375, + "learning_rate": 4.8534234393669e-05, + "loss": 0.03878538608551026, + "mean_token_accuracy": 0.9860998558998108, + "num_tokens": 24502630.0, + "step": 23200 + }, + { + "entropy": 0.025517641233018366, + "epoch": 2.048061024273022, + "grad_norm": 1.890625, + "learning_rate": 4.8330314457160885e-05, + "loss": 0.029563398361206056, + "mean_token_accuracy": 0.9919531518220901, + "num_tokens": 24528314.0, + "step": 23225 + }, + { + "entropy": 0.02861691084319318, + "epoch": 2.050265658413986, + "grad_norm": 1.265625, + "learning_rate": 4.81266872146514e-05, + "loss": 0.03183458566665649, + "mean_token_accuracy": 0.992757821381092, + "num_tokens": 24555267.0, + "step": 23250 + }, + { + "entropy": 0.02771842215915967, + "epoch": 2.0524702925549505, + "grad_norm": 1.078125, + "learning_rate": 4.7923353819630565e-05, + "loss": 0.030379624366760255, + "mean_token_accuracy": 0.9919179257750511, + "num_tokens": 24581529.0, + "step": 23275 + }, + { + "entropy": 0.03345744485504838, + "epoch": 2.054674926695915, + "grad_norm": 0.099609375, + "learning_rate": 4.7720315423924024e-05, + "loss": 0.0444830846786499, + "mean_token_accuracy": 0.9900724491477013, + "num_tokens": 24608673.0, + "step": 23300 + }, + { + "entropy": 0.029868014961575682, + "epoch": 2.056879560836879, + "grad_norm": 1.0546875, + "learning_rate": 4.751757317768618e-05, + "loss": 0.04446365356445312, + "mean_token_accuracy": 0.989599525630474, + "num_tokens": 24634789.0, + "step": 23325 + }, + { + "entropy": 0.031795623505458934, + "epoch": 2.0590841949778436, + "grad_norm": 1.40625, + "learning_rate": 4.7315128229393944e-05, + "loss": 0.037621107101440426, + "mean_token_accuracy": 0.9905502396821976, + "num_tokens": 24660928.0, + "step": 23350 + }, + { + "entropy": 0.03209367050054425, + "epoch": 2.0612888291188076, + "grad_norm": 0.462890625, + "learning_rate": 4.7112981725840065e-05, + "loss": 0.031616337299346924, + "mean_token_accuracy": 0.9918192434310913, + "num_tokens": 24686594.0, + "step": 23375 + }, + { + "entropy": 0.035774395614826066, + "epoch": 2.063493463259772, + "grad_norm": 0.5078125, + "learning_rate": 4.6911134812126745e-05, + "loss": 0.04031749248504639, + "mean_token_accuracy": 0.9876101958751679, + "num_tokens": 24713766.0, + "step": 23400 + }, + { + "entropy": 0.024187013619557548, + "epoch": 2.065698097400736, + "grad_norm": 0.271484375, + "learning_rate": 4.6709588631658975e-05, + "loss": 0.029099743366241455, + "mean_token_accuracy": 0.992880313694477, + "num_tokens": 24739687.0, + "step": 23425 + }, + { + "entropy": 0.02847926665253908, + "epoch": 2.0679027315417007, + "grad_norm": 0.703125, + "learning_rate": 4.650834432613829e-05, + "loss": 0.03781245231628418, + "mean_token_accuracy": 0.991992236673832, + "num_tokens": 24765307.0, + "step": 23450 + }, + { + "entropy": 0.025070481478396685, + "epoch": 2.0701073656826647, + "grad_norm": 0.859375, + "learning_rate": 4.63074030355562e-05, + "loss": 0.02944427490234375, + "mean_token_accuracy": 0.9928121709823609, + "num_tokens": 24790905.0, + "step": 23475 + }, + { + "entropy": 0.03197751844145387, + "epoch": 2.0723119998236292, + "grad_norm": 1.015625, + "learning_rate": 4.610676589818763e-05, + "loss": 0.031009118556976318, + "mean_token_accuracy": 0.9915920281410218, + "num_tokens": 24816043.0, + "step": 23500 + }, + { + "entropy": 0.02905171564543707, + "epoch": 2.0745166339645937, + "grad_norm": 2.1875, + "learning_rate": 4.590643405058458e-05, + "loss": 0.031951904296875, + "mean_token_accuracy": 0.9911505779623986, + "num_tokens": 24842440.0, + "step": 23525 + }, + { + "entropy": 0.02454348152547027, + "epoch": 2.076721268105558, + "grad_norm": 0.55078125, + "learning_rate": 4.570640862756973e-05, + "loss": 0.029993116855621338, + "mean_token_accuracy": 0.9923299300670624, + "num_tokens": 24868553.0, + "step": 23550 + }, + { + "entropy": 0.027761456568005088, + "epoch": 2.0789259022465223, + "grad_norm": 0.82421875, + "learning_rate": 4.550669076222992e-05, + "loss": 0.03329916954040527, + "mean_token_accuracy": 0.9906358134746551, + "num_tokens": 24894867.0, + "step": 23575 + }, + { + "entropy": 0.03804819417771796, + "epoch": 2.0811305363874864, + "grad_norm": 1.9453125, + "learning_rate": 4.530728158590982e-05, + "loss": 0.04649081707000732, + "mean_token_accuracy": 0.9858838346600532, + "num_tokens": 24922116.0, + "step": 23600 + }, + { + "entropy": 0.029030120181087114, + "epoch": 2.083335170528451, + "grad_norm": 0.455078125, + "learning_rate": 4.510818222820533e-05, + "loss": 0.03388277530670166, + "mean_token_accuracy": 0.992407936155796, + "num_tokens": 24948516.0, + "step": 23625 + }, + { + "entropy": 0.02953952599247714, + "epoch": 2.085539804669415, + "grad_norm": 0.734375, + "learning_rate": 4.490939381695743e-05, + "loss": 0.03897066593170166, + "mean_token_accuracy": 0.990988989174366, + "num_tokens": 24974732.0, + "step": 23650 + }, + { + "entropy": 0.030306367558223427, + "epoch": 2.0877444388103794, + "grad_norm": 0.6875, + "learning_rate": 4.471091747824566e-05, + "loss": 0.03767595291137695, + "mean_token_accuracy": 0.9907210981845855, + "num_tokens": 25001192.0, + "step": 23675 + }, + { + "entropy": 0.03219981058784469, + "epoch": 2.089949072951344, + "grad_norm": 0.474609375, + "learning_rate": 4.451275433638176e-05, + "loss": 0.03162062644958496, + "mean_token_accuracy": 0.991564072072506, + "num_tokens": 25026352.0, + "step": 23700 + }, + { + "entropy": 0.02597928559704087, + "epoch": 2.092153707092308, + "grad_norm": 0.259765625, + "learning_rate": 4.431490551390326e-05, + "loss": 0.03252000331878662, + "mean_token_accuracy": 0.9911320436000824, + "num_tokens": 25052073.0, + "step": 23725 + }, + { + "entropy": 0.032203564375049606, + "epoch": 2.0943583412332725, + "grad_norm": 0.55078125, + "learning_rate": 4.4117372131567136e-05, + "loss": 0.04490713119506836, + "mean_token_accuracy": 0.9891028618812561, + "num_tokens": 25077135.0, + "step": 23750 + }, + { + "entropy": 0.028571351964310453, + "epoch": 2.0965629753742365, + "grad_norm": 0.265625, + "learning_rate": 4.392015530834358e-05, + "loss": 0.02833454132080078, + "mean_token_accuracy": 0.992771886587143, + "num_tokens": 25104636.0, + "step": 23775 + }, + { + "entropy": 0.03307085748732788, + "epoch": 2.098767609515201, + "grad_norm": 1.3046875, + "learning_rate": 4.372325616140954e-05, + "loss": 0.0332148814201355, + "mean_token_accuracy": 0.9900698167085648, + "num_tokens": 25132472.0, + "step": 23800 + }, + { + "entropy": 0.030112269805176766, + "epoch": 2.100972243656165, + "grad_norm": 0.5390625, + "learning_rate": 4.352667580614236e-05, + "loss": 0.03317270040512085, + "mean_token_accuracy": 0.9916463688015937, + "num_tokens": 25158834.0, + "step": 23825 + }, + { + "entropy": 0.031288266634292086, + "epoch": 2.1031768777971296, + "grad_norm": 4.5, + "learning_rate": 4.3330415356113565e-05, + "loss": 0.03746793270111084, + "mean_token_accuracy": 0.9905204650759697, + "num_tokens": 25186663.0, + "step": 23850 + }, + { + "entropy": 0.030189171290840023, + "epoch": 2.105381511938094, + "grad_norm": 0.373046875, + "learning_rate": 4.313447592308251e-05, + "loss": 0.03885282039642334, + "mean_token_accuracy": 0.9899864155054092, + "num_tokens": 25213550.0, + "step": 23875 + }, + { + "entropy": 0.026368075483023858, + "epoch": 2.107586146079058, + "grad_norm": 0.5625, + "learning_rate": 4.293885861699011e-05, + "loss": 0.026547930240631103, + "mean_token_accuracy": 0.9932371464371681, + "num_tokens": 25238404.0, + "step": 23900 + }, + { + "entropy": 0.03681902970789452, + "epoch": 2.1097907802200226, + "grad_norm": 0.81640625, + "learning_rate": 4.2743564545952406e-05, + "loss": 0.04580854892730713, + "mean_token_accuracy": 0.9896917739510536, + "num_tokens": 25265556.0, + "step": 23925 + }, + { + "entropy": 0.026895671052625404, + "epoch": 2.1119954143609867, + "grad_norm": 0.16015625, + "learning_rate": 4.2548594816254573e-05, + "loss": 0.030972492694854737, + "mean_token_accuracy": 0.9919655025005341, + "num_tokens": 25292155.0, + "step": 23950 + }, + { + "entropy": 0.0291110172522167, + "epoch": 2.114200048501951, + "grad_norm": 0.361328125, + "learning_rate": 4.235395053234431e-05, + "loss": 0.03769398212432861, + "mean_token_accuracy": 0.9900598734617233, + "num_tokens": 25318586.0, + "step": 23975 + }, + { + "entropy": 0.023451846447514982, + "epoch": 2.1164046826429153, + "grad_norm": 1.1328125, + "learning_rate": 4.21596327968259e-05, + "loss": 0.0197164249420166, + "mean_token_accuracy": 0.9938502493500709, + "num_tokens": 25343799.0, + "step": 24000 + }, + { + "epoch": 2.1164046826429153, + "eval_entropy": 0.01599665907487422, + "eval_loss": 0.02462713047862053, + "eval_mean_token_accuracy": 0.9924976906965611, + "eval_num_tokens": 25343799.0, + "eval_runtime": 235.5413, + "eval_samples_per_second": 16.681, + "eval_steps_per_second": 4.173, + "step": 24000 + }, + { + "entropy": 0.03335717603546073, + "epoch": 2.1186093167838798, + "grad_norm": 4.46875, + "learning_rate": 4.196564271045379e-05, + "loss": 0.0459407377243042, + "mean_token_accuracy": 0.9879160994291305, + "num_tokens": 25371576.0, + "step": 24025 + }, + { + "entropy": 0.028599485470440413, + "epoch": 2.120813950924844, + "grad_norm": 2.578125, + "learning_rate": 4.177198137212629e-05, + "loss": 0.03392246723175049, + "mean_token_accuracy": 0.9923342031240463, + "num_tokens": 25398046.0, + "step": 24050 + }, + { + "entropy": 0.030389589803635316, + "epoch": 2.1230185850658083, + "grad_norm": 0.95703125, + "learning_rate": 4.157864987887957e-05, + "loss": 0.032247264385223386, + "mean_token_accuracy": 0.992058128118515, + "num_tokens": 25424355.0, + "step": 24075 + }, + { + "entropy": 0.0266408162328662, + "epoch": 2.125223219206773, + "grad_norm": 1.53125, + "learning_rate": 4.138564932588126e-05, + "loss": 0.0338702917098999, + "mean_token_accuracy": 0.9901499152183533, + "num_tokens": 25450854.0, + "step": 24100 + }, + { + "entropy": 0.02991066069038425, + "epoch": 2.127427853347737, + "grad_norm": 0.58203125, + "learning_rate": 4.1192980806424374e-05, + "loss": 0.036460573673248294, + "mean_token_accuracy": 0.9879888358712197, + "num_tokens": 25476457.0, + "step": 24125 + }, + { + "entropy": 0.029010038101114334, + "epoch": 2.1296324874887014, + "grad_norm": 0.6796875, + "learning_rate": 4.100064541192092e-05, + "loss": 0.030922062397003174, + "mean_token_accuracy": 0.9916366341710091, + "num_tokens": 25502303.0, + "step": 24150 + }, + { + "entropy": 0.03539934004222232, + "epoch": 2.1318371216296654, + "grad_norm": 3.765625, + "learning_rate": 4.080864423189601e-05, + "loss": 0.04334574222564697, + "mean_token_accuracy": 0.9894258263707161, + "num_tokens": 25528695.0, + "step": 24175 + }, + { + "entropy": 0.03038750326306399, + "epoch": 2.13404175577063, + "grad_norm": 0.259765625, + "learning_rate": 4.061697835398136e-05, + "loss": 0.03719171047210693, + "mean_token_accuracy": 0.989592821598053, + "num_tokens": 25555501.0, + "step": 24200 + }, + { + "entropy": 0.027374957603205984, + "epoch": 2.136246389911594, + "grad_norm": 0.515625, + "learning_rate": 4.042564886390946e-05, + "loss": 0.02741792917251587, + "mean_token_accuracy": 0.9915632554888725, + "num_tokens": 25582365.0, + "step": 24225 + }, + { + "entropy": 0.03261903412229003, + "epoch": 2.1384510240525585, + "grad_norm": 0.427734375, + "learning_rate": 4.023465684550709e-05, + "loss": 0.03499001502990723, + "mean_token_accuracy": 0.9893913465738297, + "num_tokens": 25608433.0, + "step": 24250 + }, + { + "entropy": 0.03213590253819348, + "epoch": 2.140655658193523, + "grad_norm": 1.0703125, + "learning_rate": 4.00440033806895e-05, + "loss": 0.050077261924743655, + "mean_token_accuracy": 0.9888369315862655, + "num_tokens": 25636119.0, + "step": 24275 + }, + { + "entropy": 0.03385305298801541, + "epoch": 2.142860292334487, + "grad_norm": 1.515625, + "learning_rate": 3.985368954945404e-05, + "loss": 0.04040426731109619, + "mean_token_accuracy": 0.9899595540761947, + "num_tokens": 25662083.0, + "step": 24300 + }, + { + "entropy": 0.028466418773750776, + "epoch": 2.1450649264754515, + "grad_norm": 0.71875, + "learning_rate": 3.966371642987423e-05, + "loss": 0.03010768175125122, + "mean_token_accuracy": 0.9902914983034133, + "num_tokens": 25688871.0, + "step": 24325 + }, + { + "entropy": 0.02227822571494471, + "epoch": 2.1472695606164156, + "grad_norm": 2.109375, + "learning_rate": 3.9474085098093396e-05, + "loss": 0.02533080816268921, + "mean_token_accuracy": 0.9931004998087883, + "num_tokens": 25715189.0, + "step": 24350 + }, + { + "entropy": 0.03268660336951143, + "epoch": 2.14947419475738, + "grad_norm": 0.5078125, + "learning_rate": 3.928479662831885e-05, + "loss": 0.046119885444641115, + "mean_token_accuracy": 0.9887044957280159, + "num_tokens": 25741862.0, + "step": 24375 + }, + { + "entropy": 0.02860573635873152, + "epoch": 2.151678828898344, + "grad_norm": 1.8203125, + "learning_rate": 3.909585209281573e-05, + "loss": 0.03543957471847534, + "mean_token_accuracy": 0.9902654913067818, + "num_tokens": 25767683.0, + "step": 24400 + }, + { + "entropy": 0.02657259469760902, + "epoch": 2.1538834630393087, + "grad_norm": 2.03125, + "learning_rate": 3.8907252561900774e-05, + "loss": 0.03090388059616089, + "mean_token_accuracy": 0.9917852020263672, + "num_tokens": 25793941.0, + "step": 24425 + }, + { + "entropy": 0.02819765280069987, + "epoch": 2.156088097180273, + "grad_norm": 1.25, + "learning_rate": 3.871899910393636e-05, + "loss": 0.032193429470062256, + "mean_token_accuracy": 0.9921908810734749, + "num_tokens": 25820598.0, + "step": 24450 + }, + { + "entropy": 0.028375080437945142, + "epoch": 2.1582927313212372, + "grad_norm": 0.9921875, + "learning_rate": 3.853109278532456e-05, + "loss": 0.034534347057342527, + "mean_token_accuracy": 0.9907797083258629, + "num_tokens": 25846699.0, + "step": 24475 + }, + { + "entropy": 0.033441547485308545, + "epoch": 2.1604973654622017, + "grad_norm": 0.85546875, + "learning_rate": 3.834353467050096e-05, + "loss": 0.03380261182785034, + "mean_token_accuracy": 0.9893592429161072, + "num_tokens": 25873526.0, + "step": 24500 + }, + { + "entropy": 0.028523832484534068, + "epoch": 2.162701999603166, + "grad_norm": 0.7109375, + "learning_rate": 3.8156325821928694e-05, + "loss": 0.03483229398727417, + "mean_token_accuracy": 0.9894470557570457, + "num_tokens": 25899950.0, + "step": 24525 + }, + { + "entropy": 0.02734538020009495, + "epoch": 2.1649066337441303, + "grad_norm": 2.328125, + "learning_rate": 3.796946730009232e-05, + "loss": 0.03180067300796509, + "mean_token_accuracy": 0.9929764324426651, + "num_tokens": 25926135.0, + "step": 24550 + }, + { + "entropy": 0.02623340411500976, + "epoch": 2.1671112678850943, + "grad_norm": 0.65234375, + "learning_rate": 3.778296016349195e-05, + "loss": 0.036533083915710446, + "mean_token_accuracy": 0.9917810225486755, + "num_tokens": 25951889.0, + "step": 24575 + }, + { + "entropy": 0.030956792831748316, + "epoch": 2.169315902026059, + "grad_norm": 0.49609375, + "learning_rate": 3.759680546863724e-05, + "loss": 0.033663554191589354, + "mean_token_accuracy": 0.990970625281334, + "num_tokens": 25977911.0, + "step": 24600 + }, + { + "entropy": 0.025135348588664782, + "epoch": 2.171520536167023, + "grad_norm": 0.59375, + "learning_rate": 3.7411004270041336e-05, + "loss": 0.02640949010848999, + "mean_token_accuracy": 0.9934472215175628, + "num_tokens": 26004056.0, + "step": 24625 + }, + { + "entropy": 0.026295318936572585, + "epoch": 2.1737251703079874, + "grad_norm": 0.11376953125, + "learning_rate": 3.722555762021489e-05, + "loss": 0.03466712474822998, + "mean_token_accuracy": 0.992102455496788, + "num_tokens": 26030865.0, + "step": 24650 + }, + { + "entropy": 0.025986581510624093, + "epoch": 2.175929804448952, + "grad_norm": 0.10400390625, + "learning_rate": 3.7040466569660115e-05, + "loss": 0.02724365234375, + "mean_token_accuracy": 0.9930882236361503, + "num_tokens": 26056914.0, + "step": 24675 + }, + { + "entropy": 0.03014488375159999, + "epoch": 2.178134438589916, + "grad_norm": 1.171875, + "learning_rate": 3.685573216686494e-05, + "loss": 0.03400787115097046, + "mean_token_accuracy": 0.9910931748151779, + "num_tokens": 26082253.0, + "step": 24700 + }, + { + "entropy": 0.04059045683388831, + "epoch": 2.1803390727308805, + "grad_norm": 0.6015625, + "learning_rate": 3.6671355458296994e-05, + "loss": 0.05361992835998535, + "mean_token_accuracy": 0.9899610930681229, + "num_tokens": 26109736.0, + "step": 24725 + }, + { + "entropy": 0.021792225182471158, + "epoch": 2.1825437068718445, + "grad_norm": 0.1630859375, + "learning_rate": 3.648733748839756e-05, + "loss": 0.024166872501373293, + "mean_token_accuracy": 0.9929477843642235, + "num_tokens": 26134972.0, + "step": 24750 + }, + { + "entropy": 0.02890670511445933, + "epoch": 2.184748341012809, + "grad_norm": 1.0, + "learning_rate": 3.6303679299575853e-05, + "loss": 0.037716662883758544, + "mean_token_accuracy": 0.9917557543516159, + "num_tokens": 26162082.0, + "step": 24775 + }, + { + "entropy": 0.03442299145397556, + "epoch": 2.186952975153773, + "grad_norm": 1.8828125, + "learning_rate": 3.612038193220302e-05, + "loss": 0.0406611967086792, + "mean_token_accuracy": 0.9893470558524132, + "num_tokens": 26189379.0, + "step": 24800 + }, + { + "entropy": 0.030212515026360052, + "epoch": 2.1891576092947376, + "grad_norm": 0.388671875, + "learning_rate": 3.593744642460629e-05, + "loss": 0.02927395820617676, + "mean_token_accuracy": 0.9920516067743301, + "num_tokens": 26215772.0, + "step": 24825 + }, + { + "entropy": 0.037156562910895446, + "epoch": 2.1913622434357016, + "grad_norm": 1.984375, + "learning_rate": 3.575487381306296e-05, + "loss": 0.051420702934265136, + "mean_token_accuracy": 0.9874176776409149, + "num_tokens": 26243037.0, + "step": 24850 + }, + { + "entropy": 0.028511690973427904, + "epoch": 2.193566877576666, + "grad_norm": 0.08984375, + "learning_rate": 3.557266513179474e-05, + "loss": 0.04477667808532715, + "mean_token_accuracy": 0.9889134570956231, + "num_tokens": 26269706.0, + "step": 24875 + }, + { + "entropy": 0.03325704227485403, + "epoch": 2.1957715117176306, + "grad_norm": 1.2734375, + "learning_rate": 3.539082141296164e-05, + "loss": 0.03619353771209717, + "mean_token_accuracy": 0.9900521844625473, + "num_tokens": 26296027.0, + "step": 24900 + }, + { + "entropy": 0.03061635433907213, + "epoch": 2.1979761458585947, + "grad_norm": 0.58203125, + "learning_rate": 3.520934368665641e-05, + "loss": 0.0380321478843689, + "mean_token_accuracy": 0.9908155652880669, + "num_tokens": 26323156.0, + "step": 24925 + }, + { + "entropy": 0.030784137591181208, + "epoch": 2.200180779999559, + "grad_norm": 1.5, + "learning_rate": 3.502823298089852e-05, + "loss": 0.034097914695739744, + "mean_token_accuracy": 0.9906668230891228, + "num_tokens": 26349731.0, + "step": 24950 + }, + { + "entropy": 0.027966852709814704, + "epoch": 2.2023854141405232, + "grad_norm": 1.171875, + "learning_rate": 3.4847490321628284e-05, + "loss": 0.03110057830810547, + "mean_token_accuracy": 0.9913800299167633, + "num_tokens": 26376290.0, + "step": 24975 + }, + { + "entropy": 0.03734282299010374, + "epoch": 2.2045900482814877, + "grad_norm": 0.271484375, + "learning_rate": 3.466711673270121e-05, + "loss": 0.04846214771270752, + "mean_token_accuracy": 0.9890136790275573, + "num_tokens": 26402644.0, + "step": 25000 + }, + { + "epoch": 2.2045900482814877, + "eval_entropy": 0.015888335370261596, + "eval_loss": 0.024382170289754868, + "eval_mean_token_accuracy": 0.9926599772876301, + "eval_num_tokens": 26402644.0, + "eval_runtime": 226.5246, + "eval_samples_per_second": 17.345, + "eval_steps_per_second": 4.339, + "step": 25000 + }, + { + "entropy": 0.03275453881615249, + "epoch": 2.206794682422452, + "grad_norm": 3.71875, + "learning_rate": 3.448711323588214e-05, + "loss": 0.03879266977310181, + "mean_token_accuracy": 0.9901682394742966, + "num_tokens": 26429152.0, + "step": 25025 + }, + { + "entropy": 0.02864964444357611, + "epoch": 2.2089993165634163, + "grad_norm": 1.4375, + "learning_rate": 3.4307480850839454e-05, + "loss": 0.03968175888061523, + "mean_token_accuracy": 0.9913057947158813, + "num_tokens": 26455036.0, + "step": 25050 + }, + { + "entropy": 0.02369352693334804, + "epoch": 2.211203950704381, + "grad_norm": 0.83984375, + "learning_rate": 3.4128220595139204e-05, + "loss": 0.025962200164794922, + "mean_token_accuracy": 0.9940554338693619, + "num_tokens": 26479705.0, + "step": 25075 + }, + { + "entropy": 0.028369134187960297, + "epoch": 2.213408584845345, + "grad_norm": 1.2109375, + "learning_rate": 3.394933348423957e-05, + "loss": 0.031112048625946045, + "mean_token_accuracy": 0.9922392535209655, + "num_tokens": 26506194.0, + "step": 25100 + }, + { + "entropy": 0.02832837748061138, + "epoch": 2.2156132189863094, + "grad_norm": 0.390625, + "learning_rate": 3.37708205314848e-05, + "loss": 0.031320822238922116, + "mean_token_accuracy": 0.9913709491491318, + "num_tokens": 26532032.0, + "step": 25125 + }, + { + "entropy": 0.023645088316334294, + "epoch": 2.2178178531272734, + "grad_norm": 4.0, + "learning_rate": 3.359268274809984e-05, + "loss": 0.027528271675109864, + "mean_token_accuracy": 0.9919891226291656, + "num_tokens": 26557094.0, + "step": 25150 + }, + { + "entropy": 0.027313509808845993, + "epoch": 2.220022487268238, + "grad_norm": 0.67578125, + "learning_rate": 3.341492114318424e-05, + "loss": 0.030699195861816405, + "mean_token_accuracy": 0.9918340718746186, + "num_tokens": 26583081.0, + "step": 25175 + }, + { + "entropy": 0.02981164409811754, + "epoch": 2.222227121409202, + "grad_norm": 1.2265625, + "learning_rate": 3.3237536723706705e-05, + "loss": 0.0402683687210083, + "mean_token_accuracy": 0.9895758175849915, + "num_tokens": 26609902.0, + "step": 25200 + }, + { + "entropy": 0.02851083199977438, + "epoch": 2.2244317555501665, + "grad_norm": 1.59375, + "learning_rate": 3.306053049449927e-05, + "loss": 0.03584902763366699, + "mean_token_accuracy": 0.9909912210702896, + "num_tokens": 26636633.0, + "step": 25225 + }, + { + "entropy": 0.02590751436029677, + "epoch": 2.226636389691131, + "grad_norm": 1.3984375, + "learning_rate": 3.2883903458251655e-05, + "loss": 0.02497697114944458, + "mean_token_accuracy": 0.9933020269870758, + "num_tokens": 26662757.0, + "step": 25250 + }, + { + "entropy": 0.025013647765863425, + "epoch": 2.228841023832095, + "grad_norm": 2.140625, + "learning_rate": 3.270765661550547e-05, + "loss": 0.027597415447235107, + "mean_token_accuracy": 0.9925726521015167, + "num_tokens": 26687762.0, + "step": 25275 + }, + { + "entropy": 0.028358331051313145, + "epoch": 2.2310456579730595, + "grad_norm": 2.296875, + "learning_rate": 3.253179096464874e-05, + "loss": 0.03725262403488159, + "mean_token_accuracy": 0.9898791015148163, + "num_tokens": 26713382.0, + "step": 25300 + }, + { + "entropy": 0.02306008890576777, + "epoch": 2.2332502921140236, + "grad_norm": 0.31640625, + "learning_rate": 3.235630750191008e-05, + "loss": 0.024607329368591307, + "mean_token_accuracy": 0.992639516890049, + "num_tokens": 26738687.0, + "step": 25325 + }, + { + "entropy": 0.03018416144524963, + "epoch": 2.235454926254988, + "grad_norm": 1.6328125, + "learning_rate": 3.2181207221353184e-05, + "loss": 0.03323997020721436, + "mean_token_accuracy": 0.990810919702053, + "num_tokens": 26766617.0, + "step": 25350 + }, + { + "entropy": 0.03350815741945553, + "epoch": 2.237659560395952, + "grad_norm": 2.3125, + "learning_rate": 3.200649111487102e-05, + "loss": 0.046819000244140624, + "mean_token_accuracy": 0.988080404996872, + "num_tokens": 26793542.0, + "step": 25375 + }, + { + "entropy": 0.02414072970594134, + "epoch": 2.2398641945369167, + "grad_norm": 0.1884765625, + "learning_rate": 3.1832160172180426e-05, + "loss": 0.0301043438911438, + "mean_token_accuracy": 0.9919259274005889, + "num_tokens": 26819952.0, + "step": 25400 + }, + { + "entropy": 0.03012874484673375, + "epoch": 2.2420688286778807, + "grad_norm": 1.453125, + "learning_rate": 3.165821538081637e-05, + "loss": 0.03285449743270874, + "mean_token_accuracy": 0.9908521872758865, + "num_tokens": 26845260.0, + "step": 25425 + }, + { + "entropy": 0.031216552823862, + "epoch": 2.244273462818845, + "grad_norm": 1.421875, + "learning_rate": 3.148465772612639e-05, + "loss": 0.03792816877365112, + "mean_token_accuracy": 0.9898707485198974, + "num_tokens": 26872952.0, + "step": 25450 + }, + { + "entropy": 0.02386282076506177, + "epoch": 2.2464780969598097, + "grad_norm": 0.3359375, + "learning_rate": 3.1311488191264926e-05, + "loss": 0.026478643417358397, + "mean_token_accuracy": 0.9940714892745018, + "num_tokens": 26897729.0, + "step": 25475 + }, + { + "entropy": 0.03102828201666853, + "epoch": 2.2486827311007738, + "grad_norm": 1.828125, + "learning_rate": 3.1138707757187925e-05, + "loss": 0.04361213684082031, + "mean_token_accuracy": 0.9898296114802361, + "num_tokens": 26924394.0, + "step": 25500 + }, + { + "entropy": 0.025993381780681377, + "epoch": 2.2508873652417383, + "grad_norm": 0.349609375, + "learning_rate": 3.096631740264718e-05, + "loss": 0.023462820053100585, + "mean_token_accuracy": 0.9935817018151283, + "num_tokens": 26950337.0, + "step": 25525 + }, + { + "entropy": 0.027647298633964965, + "epoch": 2.2530919993827023, + "grad_norm": 1.7265625, + "learning_rate": 3.079431810418473e-05, + "loss": 0.03757109642028809, + "mean_token_accuracy": 0.9909259453415871, + "num_tokens": 26975811.0, + "step": 25550 + }, + { + "entropy": 0.028983663184444595, + "epoch": 2.255296633523667, + "grad_norm": 0.94140625, + "learning_rate": 3.0622710836127474e-05, + "loss": 0.03265504121780396, + "mean_token_accuracy": 0.9918881016969681, + "num_tokens": 27002457.0, + "step": 25575 + }, + { + "entropy": 0.02929145427147887, + "epoch": 2.257501267664631, + "grad_norm": 1.578125, + "learning_rate": 3.0451496570581482e-05, + "loss": 0.036674625873565674, + "mean_token_accuracy": 0.9913082128763199, + "num_tokens": 27029283.0, + "step": 25600 + }, + { + "entropy": 0.0296330451936592, + "epoch": 2.2597059018055954, + "grad_norm": 0.8359375, + "learning_rate": 3.0280676277426655e-05, + "loss": 0.038873662948608396, + "mean_token_accuracy": 0.9899021616578102, + "num_tokens": 27055528.0, + "step": 25625 + }, + { + "entropy": 0.022964343342828215, + "epoch": 2.2619105359465594, + "grad_norm": 0.058349609375, + "learning_rate": 3.0110250924311157e-05, + "loss": 0.020480175018310547, + "mean_token_accuracy": 0.9950110822916031, + "num_tokens": 27081049.0, + "step": 25650 + }, + { + "entropy": 0.02799141699797474, + "epoch": 2.264115170087524, + "grad_norm": 1.3828125, + "learning_rate": 2.9940221476645815e-05, + "loss": 0.031359810829162595, + "mean_token_accuracy": 0.9911579310894012, + "num_tokens": 27107201.0, + "step": 25675 + }, + { + "entropy": 0.02933084479354875, + "epoch": 2.2663198042284884, + "grad_norm": 1.8203125, + "learning_rate": 2.9770588897598893e-05, + "loss": 0.03271200656890869, + "mean_token_accuracy": 0.9901346156001091, + "num_tokens": 27134550.0, + "step": 25700 + }, + { + "entropy": 0.028671488952568325, + "epoch": 2.2685244383694525, + "grad_norm": 0.546875, + "learning_rate": 2.9601354148090465e-05, + "loss": 0.035420951843261717, + "mean_token_accuracy": 0.9904685345292091, + "num_tokens": 27161406.0, + "step": 25725 + }, + { + "entropy": 0.02534457077494153, + "epoch": 2.270729072510417, + "grad_norm": 0.7890625, + "learning_rate": 2.943251818678704e-05, + "loss": 0.028134129047393798, + "mean_token_accuracy": 0.991405982375145, + "num_tokens": 27187953.0, + "step": 25750 + }, + { + "entropy": 0.03393531416601036, + "epoch": 2.272933706651381, + "grad_norm": 0.451171875, + "learning_rate": 2.9264081970096034e-05, + "loss": 0.05243350982666015, + "mean_token_accuracy": 0.9875575861334801, + "num_tokens": 27215332.0, + "step": 25775 + }, + { + "entropy": 0.023412532548227317, + "epoch": 2.2751383407923456, + "grad_norm": 0.26171875, + "learning_rate": 2.909604645216045e-05, + "loss": 0.029395694732666015, + "mean_token_accuracy": 0.9933974233269691, + "num_tokens": 27241193.0, + "step": 25800 + }, + { + "entropy": 0.027852850720009883, + "epoch": 2.27734297493331, + "grad_norm": 1.09375, + "learning_rate": 2.8928412584853494e-05, + "loss": 0.034067320823669436, + "mean_token_accuracy": 0.9859316512942314, + "num_tokens": 27268202.0, + "step": 25825 + }, + { + "entropy": 0.02478776458385255, + "epoch": 2.279547609074274, + "grad_norm": 0.8984375, + "learning_rate": 2.876118131777311e-05, + "loss": 0.031112759113311766, + "mean_token_accuracy": 0.9908994352817535, + "num_tokens": 27293667.0, + "step": 25850 + }, + { + "entropy": 0.020712875399785845, + "epoch": 2.2817522432152386, + "grad_norm": 2.28125, + "learning_rate": 2.859435359823659e-05, + "loss": 0.02065124988555908, + "mean_token_accuracy": 0.9933943581581116, + "num_tokens": 27318458.0, + "step": 25875 + }, + { + "entropy": 0.024171373999743083, + "epoch": 2.2839568773562027, + "grad_norm": 0.703125, + "learning_rate": 2.84279303712753e-05, + "loss": 0.02398313283920288, + "mean_token_accuracy": 0.9932537263631821, + "num_tokens": 27343651.0, + "step": 25900 + }, + { + "entropy": 0.03080252897207174, + "epoch": 2.286161511497167, + "grad_norm": 0.6171875, + "learning_rate": 2.8261912579629248e-05, + "loss": 0.03876256704330444, + "mean_token_accuracy": 0.9887584137916565, + "num_tokens": 27371576.0, + "step": 25925 + }, + { + "entropy": 0.030703878802341933, + "epoch": 2.2883661456381312, + "grad_norm": 0.57421875, + "learning_rate": 2.8096301163741755e-05, + "loss": 0.03663143157958984, + "mean_token_accuracy": 0.9901211553812027, + "num_tokens": 27398292.0, + "step": 25950 + }, + { + "entropy": 0.031650141541394984, + "epoch": 2.2905707797790957, + "grad_norm": 1.2734375, + "learning_rate": 2.7931097061754197e-05, + "loss": 0.04376762390136719, + "mean_token_accuracy": 0.9910077887773514, + "num_tokens": 27425104.0, + "step": 25975 + }, + { + "entropy": 0.029165671150476555, + "epoch": 2.29277541392006, + "grad_norm": 0.5703125, + "learning_rate": 2.7766301209500543e-05, + "loss": 0.03497497081756592, + "mean_token_accuracy": 0.9903140386939049, + "num_tokens": 27451361.0, + "step": 26000 + }, + { + "epoch": 2.29277541392006, + "eval_entropy": 0.015544790721468586, + "eval_loss": 0.024325313046574593, + "eval_mean_token_accuracy": 0.9927062318865911, + "eval_num_tokens": 27451361.0, + "eval_runtime": 226.2629, + "eval_samples_per_second": 17.365, + "eval_steps_per_second": 4.345, + "step": 26000 + }, + { + "entropy": 0.03006629309566051, + "epoch": 2.2949800480610243, + "grad_norm": 0.62109375, + "learning_rate": 2.7601914540502172e-05, + "loss": 0.03512540340423584, + "mean_token_accuracy": 0.9910671037435531, + "num_tokens": 27478307.0, + "step": 26025 + }, + { + "entropy": 0.023125871705269675, + "epoch": 2.297184682201989, + "grad_norm": 0.3828125, + "learning_rate": 2.743793798596259e-05, + "loss": 0.02564392566680908, + "mean_token_accuracy": 0.9933660838007927, + "num_tokens": 27503140.0, + "step": 26050 + }, + { + "entropy": 0.030678539160126094, + "epoch": 2.299389316342953, + "grad_norm": 1.953125, + "learning_rate": 2.7274372474762154e-05, + "loss": 0.03678859710693359, + "mean_token_accuracy": 0.9891363000869751, + "num_tokens": 27529219.0, + "step": 26075 + }, + { + "entropy": 0.028060704695126334, + "epoch": 2.3015939504839174, + "grad_norm": 2.4375, + "learning_rate": 2.7111218933452654e-05, + "loss": 0.027757613658905028, + "mean_token_accuracy": 0.9933583897352218, + "num_tokens": 27554880.0, + "step": 26100 + }, + { + "entropy": 0.03153505565591331, + "epoch": 2.3037985846248814, + "grad_norm": 1.6484375, + "learning_rate": 2.694847828625229e-05, + "loss": 0.04007414340972901, + "mean_token_accuracy": 0.9893001061677933, + "num_tokens": 27582143.0, + "step": 26125 + }, + { + "entropy": 0.03299531255826878, + "epoch": 2.306003218765846, + "grad_norm": 2.203125, + "learning_rate": 2.678615145504032e-05, + "loss": 0.0395844316482544, + "mean_token_accuracy": 0.9888230460882187, + "num_tokens": 27609956.0, + "step": 26150 + }, + { + "entropy": 0.03849572095681651, + "epoch": 2.30820785290681, + "grad_norm": 1.265625, + "learning_rate": 2.6624239359351856e-05, + "loss": 0.05094009876251221, + "mean_token_accuracy": 0.9879861554503441, + "num_tokens": 27637690.0, + "step": 26175 + }, + { + "entropy": 0.02834760202822508, + "epoch": 2.3104124870477745, + "grad_norm": 0.7890625, + "learning_rate": 2.6462742916372597e-05, + "loss": 0.03409828186035156, + "mean_token_accuracy": 0.9913964113593101, + "num_tokens": 27664105.0, + "step": 26200 + }, + { + "entropy": 0.024753890240317558, + "epoch": 2.3126171211887385, + "grad_norm": 3.1875, + "learning_rate": 2.6301663040933777e-05, + "loss": 0.022567291259765625, + "mean_token_accuracy": 0.993626494705677, + "num_tokens": 27690161.0, + "step": 26225 + }, + { + "entropy": 0.028063594494415155, + "epoch": 2.314821755329703, + "grad_norm": 0.443359375, + "learning_rate": 2.6141000645506786e-05, + "loss": 0.036960182189941404, + "mean_token_accuracy": 0.9914696502685547, + "num_tokens": 27716880.0, + "step": 26250 + }, + { + "entropy": 0.0276568841507833, + "epoch": 2.3170263894706675, + "grad_norm": 2.09375, + "learning_rate": 2.598075664019822e-05, + "loss": 0.04201683521270752, + "mean_token_accuracy": 0.9902168083190918, + "num_tokens": 27743023.0, + "step": 26275 + }, + { + "entropy": 0.0294130508035596, + "epoch": 2.3192310236116316, + "grad_norm": 1.234375, + "learning_rate": 2.582093193274452e-05, + "loss": 0.03933499336242676, + "mean_token_accuracy": 0.9877358794212341, + "num_tokens": 27769932.0, + "step": 26300 + }, + { + "entropy": 0.023152426210435804, + "epoch": 2.321435657752596, + "grad_norm": 0.490234375, + "learning_rate": 2.566152742850697e-05, + "loss": 0.02730790138244629, + "mean_token_accuracy": 0.9925672444701195, + "num_tokens": 27795497.0, + "step": 26325 + }, + { + "entropy": 0.03227209112326818, + "epoch": 2.32364029189356, + "grad_norm": 0.8984375, + "learning_rate": 2.5502544030466547e-05, + "loss": 0.04344738960266113, + "mean_token_accuracy": 0.9895595768094063, + "num_tokens": 27822418.0, + "step": 26350 + }, + { + "entropy": 0.025001196329394588, + "epoch": 2.3258449260345246, + "grad_norm": 1.296875, + "learning_rate": 2.5343982639218778e-05, + "loss": 0.02424089431762695, + "mean_token_accuracy": 0.9937809437513352, + "num_tokens": 27848000.0, + "step": 26375 + }, + { + "entropy": 0.03385282089677275, + "epoch": 2.3280495601754887, + "grad_norm": 0.283203125, + "learning_rate": 2.5185844152968552e-05, + "loss": 0.044082775115966796, + "mean_token_accuracy": 0.9884610024094581, + "num_tokens": 27872983.0, + "step": 26400 + }, + { + "entropy": 0.028122581911738963, + "epoch": 2.330254194316453, + "grad_norm": 0.130859375, + "learning_rate": 2.502812946752523e-05, + "loss": 0.033366072177886966, + "mean_token_accuracy": 0.9861757379770278, + "num_tokens": 27900430.0, + "step": 26425 + }, + { + "entropy": 0.03773002337271464, + "epoch": 2.3324588284574173, + "grad_norm": 0.62109375, + "learning_rate": 2.4870839476297437e-05, + "loss": 0.04225398063659668, + "mean_token_accuracy": 0.988186694085598, + "num_tokens": 27929118.0, + "step": 26450 + }, + { + "entropy": 0.02792194237914373, + "epoch": 2.3346634625983818, + "grad_norm": 1.3359375, + "learning_rate": 2.4713975070287986e-05, + "loss": 0.03388465404510498, + "mean_token_accuracy": 0.9922302371263504, + "num_tokens": 27955837.0, + "step": 26475 + }, + { + "entropy": 0.026086576611960482, + "epoch": 2.3368680967393463, + "grad_norm": 0.275390625, + "learning_rate": 2.4557537138088872e-05, + "loss": 0.03237960815429688, + "mean_token_accuracy": 0.9911448901891708, + "num_tokens": 27982908.0, + "step": 26500 + }, + { + "entropy": 0.0317309611546807, + "epoch": 2.3390727308803103, + "grad_norm": 1.6484375, + "learning_rate": 2.4401526565876286e-05, + "loss": 0.03967868804931641, + "mean_token_accuracy": 0.9839690843224526, + "num_tokens": 28010234.0, + "step": 26525 + }, + { + "entropy": 0.023329728747648916, + "epoch": 2.341277365021275, + "grad_norm": 0.8828125, + "learning_rate": 2.4245944237405525e-05, + "loss": 0.01898858666419983, + "mean_token_accuracy": 0.9931737449765206, + "num_tokens": 28035715.0, + "step": 26550 + }, + { + "entropy": 0.02648252934213815, + "epoch": 2.343481999162239, + "grad_norm": 0.4140625, + "learning_rate": 2.4090791034006044e-05, + "loss": 0.02758594036102295, + "mean_token_accuracy": 0.9942780634760857, + "num_tokens": 28061283.0, + "step": 26575 + }, + { + "entropy": 0.03423773757087474, + "epoch": 2.3456866333032034, + "grad_norm": 2.625, + "learning_rate": 2.3936067834576324e-05, + "loss": 0.045157794952392576, + "mean_token_accuracy": 0.9892179015278816, + "num_tokens": 28089089.0, + "step": 26600 + }, + { + "entropy": 0.03164266299670999, + "epoch": 2.347891267444168, + "grad_norm": 0.3984375, + "learning_rate": 2.3781775515579087e-05, + "loss": 0.03755315780639648, + "mean_token_accuracy": 0.9894537970423698, + "num_tokens": 28115789.0, + "step": 26625 + }, + { + "entropy": 0.027904597169217595, + "epoch": 2.350095901585132, + "grad_norm": 0.62109375, + "learning_rate": 2.3627914951036212e-05, + "loss": 0.03878262996673584, + "mean_token_accuracy": 0.9887554702162743, + "num_tokens": 28141834.0, + "step": 26650 + }, + { + "entropy": 0.026722210118896327, + "epoch": 2.3523005357260964, + "grad_norm": 0.431640625, + "learning_rate": 2.347448701252386e-05, + "loss": 0.02975456714630127, + "mean_token_accuracy": 0.9917343974113464, + "num_tokens": 28168832.0, + "step": 26675 + }, + { + "entropy": 0.027321821880686912, + "epoch": 2.3545051698670605, + "grad_norm": 0.306640625, + "learning_rate": 2.3321492569167402e-05, + "loss": 0.033426897525787355, + "mean_token_accuracy": 0.9895308339595794, + "num_tokens": 28195876.0, + "step": 26700 + }, + { + "entropy": 0.028027048043622927, + "epoch": 2.356709804008025, + "grad_norm": 0.49609375, + "learning_rate": 2.3168932487636595e-05, + "loss": 0.0321111536026001, + "mean_token_accuracy": 0.9926772129535675, + "num_tokens": 28222533.0, + "step": 26725 + }, + { + "entropy": 0.03335825680707785, + "epoch": 2.358914438148989, + "grad_norm": 2.53125, + "learning_rate": 2.3016807632140735e-05, + "loss": 0.03918349027633667, + "mean_token_accuracy": 0.9899929386377334, + "num_tokens": 28249493.0, + "step": 26750 + }, + { + "entropy": 0.02355239788603285, + "epoch": 2.3611190722899535, + "grad_norm": 0.337890625, + "learning_rate": 2.286511886442365e-05, + "loss": 0.027818148136138917, + "mean_token_accuracy": 0.9934883451461792, + "num_tokens": 28274936.0, + "step": 26775 + }, + { + "entropy": 0.03090159765084536, + "epoch": 2.3633237064309176, + "grad_norm": 0.69140625, + "learning_rate": 2.271386704375881e-05, + "loss": 0.0381850004196167, + "mean_token_accuracy": 0.9901610732078552, + "num_tokens": 28302998.0, + "step": 26800 + }, + { + "entropy": 0.030783424060173273, + "epoch": 2.365528340571882, + "grad_norm": 2.46875, + "learning_rate": 2.2563053026944557e-05, + "loss": 0.03481113433837891, + "mean_token_accuracy": 0.9903145882487298, + "num_tokens": 28330381.0, + "step": 26825 + }, + { + "entropy": 0.02382873230339101, + "epoch": 2.3677329747128466, + "grad_norm": 0.470703125, + "learning_rate": 2.2412677668299197e-05, + "loss": 0.022453012466430663, + "mean_token_accuracy": 0.9938893175125122, + "num_tokens": 28356826.0, + "step": 26850 + }, + { + "entropy": 0.030640801413319423, + "epoch": 2.3699376088538107, + "grad_norm": 1.296875, + "learning_rate": 2.2262741819656173e-05, + "loss": 0.043948798179626464, + "mean_token_accuracy": 0.989762376844883, + "num_tokens": 28383268.0, + "step": 26875 + }, + { + "entropy": 0.03442299180154805, + "epoch": 2.372142242994775, + "grad_norm": 1.484375, + "learning_rate": 2.211324633035916e-05, + "loss": 0.04683504581451416, + "mean_token_accuracy": 0.9879697850346565, + "num_tokens": 28409715.0, + "step": 26900 + }, + { + "entropy": 0.028344044167606625, + "epoch": 2.374346877135739, + "grad_norm": 0.55078125, + "learning_rate": 2.1964192047257415e-05, + "loss": 0.03574753046035767, + "mean_token_accuracy": 0.9921541050076484, + "num_tokens": 28436311.0, + "step": 26925 + }, + { + "entropy": 0.029762789253472876, + "epoch": 2.3765515112767037, + "grad_norm": 0.478515625, + "learning_rate": 2.1815579814700793e-05, + "loss": 0.034858622550964356, + "mean_token_accuracy": 0.9897570988535881, + "num_tokens": 28463719.0, + "step": 26950 + }, + { + "entropy": 0.02525413668121473, + "epoch": 2.3787561454176678, + "grad_norm": 2.203125, + "learning_rate": 2.1667410474535134e-05, + "loss": 0.029147915840148926, + "mean_token_accuracy": 0.9873283988237381, + "num_tokens": 28490183.0, + "step": 26975 + }, + { + "entropy": 0.024068699735253177, + "epoch": 2.3809607795586323, + "grad_norm": 0.765625, + "learning_rate": 2.1519684866097432e-05, + "loss": 0.022618768215179445, + "mean_token_accuracy": 0.992685379087925, + "num_tokens": 28515389.0, + "step": 27000 + }, + { + "epoch": 2.3809607795586323, + "eval_entropy": 0.01530946354798135, + "eval_loss": 0.024163657799363136, + "eval_mean_token_accuracy": 0.9927907401765957, + "eval_num_tokens": 28515389.0, + "eval_runtime": 226.0753, + "eval_samples_per_second": 17.379, + "eval_steps_per_second": 4.348, + "step": 27000 + }, + { + "entropy": 0.0272020617283124, + "epoch": 2.3831654136995963, + "grad_norm": 1.046875, + "learning_rate": 2.1372403826210974e-05, + "loss": 0.039848828315734865, + "mean_token_accuracy": 0.9914286798238754, + "num_tokens": 28541578.0, + "step": 27025 + }, + { + "entropy": 0.025422237896564184, + "epoch": 2.385370047840561, + "grad_norm": 2.703125, + "learning_rate": 2.1225568189180768e-05, + "loss": 0.02730207920074463, + "mean_token_accuracy": 0.9945016172528267, + "num_tokens": 28567968.0, + "step": 27050 + }, + { + "entropy": 0.030367342436147738, + "epoch": 2.3875746819815253, + "grad_norm": 1.984375, + "learning_rate": 2.1079178786788735e-05, + "loss": 0.04000374317169189, + "mean_token_accuracy": 0.9900528371334076, + "num_tokens": 28594455.0, + "step": 27075 + }, + { + "entropy": 0.025867001799779247, + "epoch": 2.3897793161224894, + "grad_norm": 0.6796875, + "learning_rate": 2.0933236448289006e-05, + "loss": 0.027899935245513915, + "mean_token_accuracy": 0.9905596524477005, + "num_tokens": 28619924.0, + "step": 27100 + }, + { + "entropy": 0.026652738297161704, + "epoch": 2.391983950263454, + "grad_norm": 0.46875, + "learning_rate": 2.0787742000403177e-05, + "loss": 0.031079134941101073, + "mean_token_accuracy": 0.9919494143128396, + "num_tokens": 28646733.0, + "step": 27125 + }, + { + "entropy": 0.028722508352439037, + "epoch": 2.394188584404418, + "grad_norm": 1.4375, + "learning_rate": 2.064269626731573e-05, + "loss": 0.03450024604797363, + "mean_token_accuracy": 0.9902220144867897, + "num_tokens": 28672305.0, + "step": 27150 + }, + { + "entropy": 0.02438920122654963, + "epoch": 2.3963932185453825, + "grad_norm": 1.828125, + "learning_rate": 2.0498100070669256e-05, + "loss": 0.031473851203918456, + "mean_token_accuracy": 0.9928356763720513, + "num_tokens": 28699490.0, + "step": 27175 + }, + { + "entropy": 0.02671077159993729, + "epoch": 2.3985978526863465, + "grad_norm": 0.490234375, + "learning_rate": 2.0353954229559925e-05, + "loss": 0.03216356515884399, + "mean_token_accuracy": 0.9926029777526856, + "num_tokens": 28725754.0, + "step": 27200 + }, + { + "entropy": 0.028233824653707416, + "epoch": 2.400802486827311, + "grad_norm": 0.72265625, + "learning_rate": 2.0210259560532652e-05, + "loss": 0.03836059808731079, + "mean_token_accuracy": 0.9901475051045417, + "num_tokens": 28753121.0, + "step": 27225 + }, + { + "entropy": 0.025337186203068997, + "epoch": 2.403007120968275, + "grad_norm": 4.28125, + "learning_rate": 2.0067016877576705e-05, + "loss": 0.02860053539276123, + "mean_token_accuracy": 0.9930732557177544, + "num_tokens": 28778679.0, + "step": 27250 + }, + { + "entropy": 0.02454310272631119, + "epoch": 2.4052117551092396, + "grad_norm": 0.451171875, + "learning_rate": 1.9924226992120922e-05, + "loss": 0.02276606559753418, + "mean_token_accuracy": 0.9938413736224174, + "num_tokens": 28805474.0, + "step": 27275 + }, + { + "entropy": 0.03040276386695041, + "epoch": 2.407416389250204, + "grad_norm": 2.5, + "learning_rate": 1.978189071302923e-05, + "loss": 0.03786967277526856, + "mean_token_accuracy": 0.9903272116184234, + "num_tokens": 28832117.0, + "step": 27300 + }, + { + "entropy": 0.02637321831722147, + "epoch": 2.409621023391168, + "grad_norm": 0.2734375, + "learning_rate": 1.9640008846595882e-05, + "loss": 0.028791847229003905, + "mean_token_accuracy": 0.9920576027035714, + "num_tokens": 28858739.0, + "step": 27325 + }, + { + "entropy": 0.02872723880540434, + "epoch": 2.4118256575321326, + "grad_norm": 0.34765625, + "learning_rate": 1.9498582196541182e-05, + "loss": 0.02915778636932373, + "mean_token_accuracy": 0.9923367646336555, + "num_tokens": 28884949.0, + "step": 27350 + }, + { + "entropy": 0.026446953547056184, + "epoch": 2.4140302916730967, + "grad_norm": 0.52734375, + "learning_rate": 1.9357611564006594e-05, + "loss": 0.03899633646011352, + "mean_token_accuracy": 0.9909557834267616, + "num_tokens": 28911648.0, + "step": 27375 + }, + { + "entropy": 0.028178447782229343, + "epoch": 2.416234925814061, + "grad_norm": 1.5546875, + "learning_rate": 1.9217097747550518e-05, + "loss": 0.03861397743225098, + "mean_token_accuracy": 0.990208261013031, + "num_tokens": 28937290.0, + "step": 27400 + }, + { + "entropy": 0.02791525627584633, + "epoch": 2.4184395599550257, + "grad_norm": 1.65625, + "learning_rate": 1.9077041543143515e-05, + "loss": 0.034581294059753416, + "mean_token_accuracy": 0.989730831682682, + "num_tokens": 28963573.0, + "step": 27425 + }, + { + "entropy": 0.02653196087485412, + "epoch": 2.4206441940959897, + "grad_norm": 0.54296875, + "learning_rate": 1.893744374416395e-05, + "loss": 0.028083674907684326, + "mean_token_accuracy": 0.9924825802445412, + "num_tokens": 28990395.0, + "step": 27450 + }, + { + "entropy": 0.0226342674649095, + "epoch": 2.4228488282369542, + "grad_norm": 1.875, + "learning_rate": 1.8798305141393468e-05, + "loss": 0.03195946455001831, + "mean_token_accuracy": 0.9923165252804756, + "num_tokens": 29016564.0, + "step": 27475 + }, + { + "entropy": 0.02829852351467707, + "epoch": 2.4250534623779183, + "grad_norm": 1.28125, + "learning_rate": 1.865962652301251e-05, + "loss": 0.03136762857437134, + "mean_token_accuracy": 0.9916561701893807, + "num_tokens": 29043714.0, + "step": 27500 + }, + { + "entropy": 0.027737032295808605, + "epoch": 2.427258096518883, + "grad_norm": 0.70703125, + "learning_rate": 1.8521408674595742e-05, + "loss": 0.027605950832366943, + "mean_token_accuracy": 0.9930538147687912, + "num_tokens": 29070351.0, + "step": 27525 + }, + { + "entropy": 0.02750062160557718, + "epoch": 2.429462730659847, + "grad_norm": 0.8828125, + "learning_rate": 1.8383652379107787e-05, + "loss": 0.033921008110046384, + "mean_token_accuracy": 0.9925847980380058, + "num_tokens": 29096293.0, + "step": 27550 + }, + { + "entropy": 0.02087086517283751, + "epoch": 2.4316673648008114, + "grad_norm": 4.625, + "learning_rate": 1.8246358416898724e-05, + "loss": 0.026158723831176758, + "mean_token_accuracy": 0.9937206152081489, + "num_tokens": 29121703.0, + "step": 27575 + }, + { + "entropy": 0.02890322234330597, + "epoch": 2.4338719989417754, + "grad_norm": 0.640625, + "learning_rate": 1.8109527565699536e-05, + "loss": 0.037437882423400876, + "mean_token_accuracy": 0.9888174629211426, + "num_tokens": 29148133.0, + "step": 27600 + }, + { + "entropy": 0.02840543278405676, + "epoch": 2.43607663308274, + "grad_norm": 0.83984375, + "learning_rate": 1.7973160600617944e-05, + "loss": 0.029362483024597166, + "mean_token_accuracy": 0.9919123184680939, + "num_tokens": 29174862.0, + "step": 27625 + }, + { + "entropy": 0.029121692281260037, + "epoch": 2.4382812672237044, + "grad_norm": 2.859375, + "learning_rate": 1.7837258294133764e-05, + "loss": 0.033895456790924074, + "mean_token_accuracy": 0.9921913406252861, + "num_tokens": 29201323.0, + "step": 27650 + }, + { + "entropy": 0.026827582789373992, + "epoch": 2.4404859013646685, + "grad_norm": 0.77734375, + "learning_rate": 1.7701821416094745e-05, + "loss": 0.029248933792114257, + "mean_token_accuracy": 0.9929554259777069, + "num_tokens": 29227216.0, + "step": 27675 + }, + { + "entropy": 0.020511043658625567, + "epoch": 2.442690535505633, + "grad_norm": 1.5859375, + "learning_rate": 1.7566850733712105e-05, + "loss": 0.02211562156677246, + "mean_token_accuracy": 0.9927102276682853, + "num_tokens": 29252647.0, + "step": 27700 + }, + { + "entropy": 0.025656236830272974, + "epoch": 2.444895169646597, + "grad_norm": 1.015625, + "learning_rate": 1.743234701155614e-05, + "loss": 0.02908522605895996, + "mean_token_accuracy": 0.9913544818758965, + "num_tokens": 29278335.0, + "step": 27725 + }, + { + "entropy": 0.022964876653477403, + "epoch": 2.4470998037875615, + "grad_norm": 1.7734375, + "learning_rate": 1.7298311011551995e-05, + "loss": 0.02672830820083618, + "mean_token_accuracy": 0.9933206418156624, + "num_tokens": 29303676.0, + "step": 27750 + }, + { + "entropy": 0.03414529487632535, + "epoch": 2.4493044379285256, + "grad_norm": 0.88671875, + "learning_rate": 1.716474349297531e-05, + "loss": 0.04021317481994629, + "mean_token_accuracy": 0.9896806997060775, + "num_tokens": 29331995.0, + "step": 27775 + }, + { + "entropy": 0.024354764178715415, + "epoch": 2.45150907206949, + "grad_norm": 2.890625, + "learning_rate": 1.7031645212447913e-05, + "loss": 0.029198453426361085, + "mean_token_accuracy": 0.9919512045383453, + "num_tokens": 29357249.0, + "step": 27800 + }, + { + "entropy": 0.03601440033242397, + "epoch": 2.453713706210454, + "grad_norm": 4.28125, + "learning_rate": 1.68990169239335e-05, + "loss": 0.04773323059082031, + "mean_token_accuracy": 0.9883570069074631, + "num_tokens": 29384591.0, + "step": 27825 + }, + { + "entropy": 0.025379649180413254, + "epoch": 2.4559183403514186, + "grad_norm": 0.91015625, + "learning_rate": 1.676685937873337e-05, + "loss": 0.03554457426071167, + "mean_token_accuracy": 0.9918042355775833, + "num_tokens": 29411165.0, + "step": 27850 + }, + { + "entropy": 0.030096235661039826, + "epoch": 2.458122974492383, + "grad_norm": 0.51171875, + "learning_rate": 1.663517332548229e-05, + "loss": 0.03285525560379028, + "mean_token_accuracy": 0.9914932417869567, + "num_tokens": 29436990.0, + "step": 27875 + }, + { + "entropy": 0.02419358774619468, + "epoch": 2.460327608633347, + "grad_norm": 3.609375, + "learning_rate": 1.6503959510144106e-05, + "loss": 0.03251128196716309, + "mean_token_accuracy": 0.9923573270440101, + "num_tokens": 29462865.0, + "step": 27900 + }, + { + "entropy": 0.028468720067612596, + "epoch": 2.4625322427743117, + "grad_norm": 0.2021484375, + "learning_rate": 1.6373218676007553e-05, + "loss": 0.0337121057510376, + "mean_token_accuracy": 0.9913557228446007, + "num_tokens": 29489633.0, + "step": 27925 + }, + { + "entropy": 0.024808133746118982, + "epoch": 2.4647368769152758, + "grad_norm": 0.11767578125, + "learning_rate": 1.6242951563682086e-05, + "loss": 0.028612098693847655, + "mean_token_accuracy": 0.9922372248768806, + "num_tokens": 29514759.0, + "step": 27950 + }, + { + "entropy": 0.026218910121970113, + "epoch": 2.4669415110562403, + "grad_norm": 1.6875, + "learning_rate": 1.611315891109367e-05, + "loss": 0.02892878532409668, + "mean_token_accuracy": 0.9922880592942238, + "num_tokens": 29540408.0, + "step": 27975 + }, + { + "entropy": 0.028717517795703317, + "epoch": 2.4691461451972048, + "grad_norm": 3.234375, + "learning_rate": 1.5983841453480576e-05, + "loss": 0.03306084871292114, + "mean_token_accuracy": 0.9898355248570442, + "num_tokens": 29567167.0, + "step": 28000 + }, + { + "epoch": 2.4691461451972048, + "eval_entropy": 0.015174210387057532, + "eval_loss": 0.024244122207164764, + "eval_mean_token_accuracy": 0.9927880360175488, + "eval_num_tokens": 29567167.0, + "eval_runtime": 246.3571, + "eval_samples_per_second": 15.948, + "eval_steps_per_second": 3.99, + "step": 28000 + }, + { + "entropy": 0.02790546601085225, + "epoch": 2.471350779338169, + "grad_norm": 3.484375, + "learning_rate": 1.5854999923389258e-05, + "loss": 0.031562213897705076, + "mean_token_accuracy": 0.9908042460680008, + "num_tokens": 29593800.0, + "step": 28025 + }, + { + "entropy": 0.029840830979082966, + "epoch": 2.4735554134791333, + "grad_norm": 1.7890625, + "learning_rate": 1.5726635050670123e-05, + "loss": 0.036235096454620364, + "mean_token_accuracy": 0.9905793526768685, + "num_tokens": 29619591.0, + "step": 28050 + }, + { + "entropy": 0.029747921650014178, + "epoch": 2.4757600476200974, + "grad_norm": 1.1796875, + "learning_rate": 1.559874756247347e-05, + "loss": 0.03456358909606934, + "mean_token_accuracy": 0.9919943836331367, + "num_tokens": 29646045.0, + "step": 28075 + }, + { + "entropy": 0.02587532730682142, + "epoch": 2.477964681761062, + "grad_norm": 0.4609375, + "learning_rate": 1.5471338183245386e-05, + "loss": 0.032157759666442874, + "mean_token_accuracy": 0.9919455042481422, + "num_tokens": 29673216.0, + "step": 28100 + }, + { + "entropy": 0.023125849339194246, + "epoch": 2.480169315902026, + "grad_norm": 0.06591796875, + "learning_rate": 1.534440763472361e-05, + "loss": 0.028566434383392333, + "mean_token_accuracy": 0.9923149171471596, + "num_tokens": 29698828.0, + "step": 28125 + }, + { + "entropy": 0.028684415022034955, + "epoch": 2.4823739500429904, + "grad_norm": 0.376953125, + "learning_rate": 1.521795663593335e-05, + "loss": 0.033868846893310545, + "mean_token_accuracy": 0.9930822342634201, + "num_tokens": 29724874.0, + "step": 28150 + }, + { + "entropy": 0.02750486832945171, + "epoch": 2.4845785841839545, + "grad_norm": 0.95703125, + "learning_rate": 1.5091985903183415e-05, + "loss": 0.03277843952178955, + "mean_token_accuracy": 0.9917207375168801, + "num_tokens": 29751693.0, + "step": 28175 + }, + { + "entropy": 0.02878666256321594, + "epoch": 2.486783218324919, + "grad_norm": 3.671875, + "learning_rate": 1.496649615006197e-05, + "loss": 0.03613584280014038, + "mean_token_accuracy": 0.9895001071691513, + "num_tokens": 29779275.0, + "step": 28200 + }, + { + "entropy": 0.023778437117980503, + "epoch": 2.4889878524658835, + "grad_norm": 0.158203125, + "learning_rate": 1.4841488087432642e-05, + "loss": 0.035824286937713626, + "mean_token_accuracy": 0.991601872742176, + "num_tokens": 29805152.0, + "step": 28225 + }, + { + "entropy": 0.028207669479197648, + "epoch": 2.4911924866068476, + "grad_norm": 1.46875, + "learning_rate": 1.4716962423430314e-05, + "loss": 0.03573413610458374, + "mean_token_accuracy": 0.9917686119675636, + "num_tokens": 29831523.0, + "step": 28250 + }, + { + "entropy": 0.027834644988688526, + "epoch": 2.493397120747812, + "grad_norm": 1.1640625, + "learning_rate": 1.4592919863457332e-05, + "loss": 0.03196176767349243, + "mean_token_accuracy": 0.9919345453381538, + "num_tokens": 29858086.0, + "step": 28275 + }, + { + "entropy": 0.030679253008656815, + "epoch": 2.495601754888776, + "grad_norm": 0.90234375, + "learning_rate": 1.446936111017928e-05, + "loss": 0.03689356803894043, + "mean_token_accuracy": 0.9908839726448059, + "num_tokens": 29883903.0, + "step": 28300 + }, + { + "entropy": 0.027692356215484323, + "epoch": 2.4978063890297406, + "grad_norm": 0.53515625, + "learning_rate": 1.434628686352123e-05, + "loss": 0.030295934677124024, + "mean_token_accuracy": 0.9929940468072891, + "num_tokens": 29910032.0, + "step": 28325 + }, + { + "entropy": 0.03307623949665867, + "epoch": 2.5000110231707047, + "grad_norm": 4.3125, + "learning_rate": 1.422369782066355e-05, + "loss": 0.042545747756958005, + "mean_token_accuracy": 0.9901801961660385, + "num_tokens": 29937722.0, + "step": 28350 + }, + { + "entropy": 0.024893675402163352, + "epoch": 2.502215657311669, + "grad_norm": 0.47265625, + "learning_rate": 1.4101594676038165e-05, + "loss": 0.03002509593963623, + "mean_token_accuracy": 0.9913722136616707, + "num_tokens": 29963785.0, + "step": 28375 + }, + { + "entropy": 0.02339222040751338, + "epoch": 2.5044202914526332, + "grad_norm": 0.703125, + "learning_rate": 1.3979978121324488e-05, + "loss": 0.025294029712677003, + "mean_token_accuracy": 0.9939557710289955, + "num_tokens": 29990305.0, + "step": 28400 + }, + { + "entropy": 0.021295039523283776, + "epoch": 2.5066249255935977, + "grad_norm": 1.0859375, + "learning_rate": 1.3858848845445538e-05, + "loss": 0.01983662486076355, + "mean_token_accuracy": 0.9948306560516358, + "num_tokens": 30015670.0, + "step": 28425 + }, + { + "entropy": 0.025478222227975494, + "epoch": 2.5088295597345622, + "grad_norm": 2.46875, + "learning_rate": 1.3738207534564007e-05, + "loss": 0.0268137526512146, + "mean_token_accuracy": 0.9927583593130112, + "num_tokens": 30041298.0, + "step": 28450 + }, + { + "entropy": 0.027219222592430016, + "epoch": 2.5110341938755263, + "grad_norm": 0.6328125, + "learning_rate": 1.3618054872078456e-05, + "loss": 0.03663578748703003, + "mean_token_accuracy": 0.9898635944724083, + "num_tokens": 30067356.0, + "step": 28475 + }, + { + "entropy": 0.02611872347224562, + "epoch": 2.513238828016491, + "grad_norm": 0.5234375, + "learning_rate": 1.3498391538619405e-05, + "loss": 0.029873659610748293, + "mean_token_accuracy": 0.9930629214644432, + "num_tokens": 30094241.0, + "step": 28500 + }, + { + "entropy": 0.029561223531491123, + "epoch": 2.515443462157455, + "grad_norm": 1.0078125, + "learning_rate": 1.3379218212045386e-05, + "loss": 0.029539554119110106, + "mean_token_accuracy": 0.9937109515070915, + "num_tokens": 30119806.0, + "step": 28525 + }, + { + "entropy": 0.027742599264747696, + "epoch": 2.5176480962984193, + "grad_norm": 2.640625, + "learning_rate": 1.3260535567439215e-05, + "loss": 0.027226037979125976, + "mean_token_accuracy": 0.9915308326482772, + "num_tokens": 30145291.0, + "step": 28550 + }, + { + "entropy": 0.029665047602356935, + "epoch": 2.519852730439384, + "grad_norm": 2.03125, + "learning_rate": 1.3142344277104169e-05, + "loss": 0.02815444231033325, + "mean_token_accuracy": 0.9921803346276283, + "num_tokens": 30172389.0, + "step": 28575 + }, + { + "entropy": 0.03133596465810115, + "epoch": 2.522057364580348, + "grad_norm": 1.1015625, + "learning_rate": 1.3024645010560133e-05, + "loss": 0.03243262767791748, + "mean_token_accuracy": 0.9912997674942017, + "num_tokens": 30200862.0, + "step": 28600 + }, + { + "entropy": 0.02291832109705865, + "epoch": 2.524261998721312, + "grad_norm": 1.2109375, + "learning_rate": 1.290743843453982e-05, + "loss": 0.028611266613006593, + "mean_token_accuracy": 0.992739563882351, + "num_tokens": 30225486.0, + "step": 28625 + }, + { + "entropy": 0.024033257207702264, + "epoch": 2.5264666328622765, + "grad_norm": 1.1171875, + "learning_rate": 1.2790725212984945e-05, + "loss": 0.028946173191070557, + "mean_token_accuracy": 0.9921584859490394, + "num_tokens": 30252052.0, + "step": 28650 + }, + { + "entropy": 0.03086577148964352, + "epoch": 2.528671267003241, + "grad_norm": 0.62109375, + "learning_rate": 1.2674506007042542e-05, + "loss": 0.04069224357604981, + "mean_token_accuracy": 0.9903390213847161, + "num_tokens": 30279085.0, + "step": 28675 + }, + { + "entropy": 0.02383694459356775, + "epoch": 2.530875901144205, + "grad_norm": 1.2890625, + "learning_rate": 1.255878147506122e-05, + "loss": 0.029977023601531982, + "mean_token_accuracy": 0.991557080745697, + "num_tokens": 30305409.0, + "step": 28700 + }, + { + "entropy": 0.027153952862045118, + "epoch": 2.5330805352851695, + "grad_norm": 0.69140625, + "learning_rate": 1.2443552272587377e-05, + "loss": 0.03152206897735596, + "mean_token_accuracy": 0.9924914839863778, + "num_tokens": 30332015.0, + "step": 28725 + }, + { + "entropy": 0.025579888974643836, + "epoch": 2.5352851694261336, + "grad_norm": 0.87109375, + "learning_rate": 1.2328819052361495e-05, + "loss": 0.028663196563720704, + "mean_token_accuracy": 0.992575915157795, + "num_tokens": 30359014.0, + "step": 28750 + }, + { + "entropy": 0.02768795941843564, + "epoch": 2.537489803567098, + "grad_norm": 0.1943359375, + "learning_rate": 1.2214582464314428e-05, + "loss": 0.02344026803970337, + "mean_token_accuracy": 0.994471182525158, + "num_tokens": 30385317.0, + "step": 28775 + }, + { + "entropy": 0.022483936410135356, + "epoch": 2.5396944377080626, + "grad_norm": 2.125, + "learning_rate": 1.2100843155563845e-05, + "loss": 0.02409367561340332, + "mean_token_accuracy": 0.9890817698836326, + "num_tokens": 30411445.0, + "step": 28800 + }, + { + "entropy": 0.0211687412471656, + "epoch": 2.5418990718490266, + "grad_norm": 0.4609375, + "learning_rate": 1.1987601770410406e-05, + "loss": 0.022674081325531007, + "mean_token_accuracy": 0.9933211871981621, + "num_tokens": 30436622.0, + "step": 28825 + }, + { + "entropy": 0.024043053868699645, + "epoch": 2.5441037059899907, + "grad_norm": 1.15625, + "learning_rate": 1.1874858950334167e-05, + "loss": 0.025228850841522217, + "mean_token_accuracy": 0.9945827552676201, + "num_tokens": 30462046.0, + "step": 28850 + }, + { + "entropy": 0.026052604026444895, + "epoch": 2.546308340130955, + "grad_norm": 0.62109375, + "learning_rate": 1.1762615333990989e-05, + "loss": 0.03171946048736572, + "mean_token_accuracy": 0.9917865255475045, + "num_tokens": 30488101.0, + "step": 28875 + }, + { + "entropy": 0.030083065580874972, + "epoch": 2.5485129742719197, + "grad_norm": 0.71875, + "learning_rate": 1.1650871557208876e-05, + "loss": 0.03770236015319824, + "mean_token_accuracy": 0.9915402907133103, + "num_tokens": 30515183.0, + "step": 28900 + }, + { + "entropy": 0.022853468903722387, + "epoch": 2.5507176084128838, + "grad_norm": 1.2421875, + "learning_rate": 1.1539628252984403e-05, + "loss": 0.02523197889328003, + "mean_token_accuracy": 0.9935995003581047, + "num_tokens": 30540057.0, + "step": 28925 + }, + { + "entropy": 0.029179727130540414, + "epoch": 2.5529222425538483, + "grad_norm": 0.337890625, + "learning_rate": 1.1428886051479037e-05, + "loss": 0.03404136896133423, + "mean_token_accuracy": 0.9906327998638154, + "num_tokens": 30566287.0, + "step": 28950 + }, + { + "entropy": 0.02455010826686703, + "epoch": 2.5551268766948123, + "grad_norm": 0.88671875, + "learning_rate": 1.1318645580015752e-05, + "loss": 0.03312621593475342, + "mean_token_accuracy": 0.992313320338726, + "num_tokens": 30591905.0, + "step": 28975 + }, + { + "entropy": 0.024439287170152965, + "epoch": 2.557331510835777, + "grad_norm": 0.91796875, + "learning_rate": 1.1208907463075246e-05, + "loss": 0.02883162260055542, + "mean_token_accuracy": 0.9913711741566658, + "num_tokens": 30617761.0, + "step": 29000 + }, + { + "epoch": 2.557331510835777, + "eval_entropy": 0.01511552784521922, + "eval_loss": 0.024238504469394684, + "eval_mean_token_accuracy": 0.9927981172670304, + "eval_num_tokens": 30617761.0, + "eval_runtime": 227.0085, + "eval_samples_per_second": 17.308, + "eval_steps_per_second": 4.33, + "step": 29000 + }, + { + "entropy": 0.028338864904362708, + "epoch": 2.5595361449767413, + "grad_norm": 0.41796875, + "learning_rate": 1.1099672322292621e-05, + "loss": 0.04463479518890381, + "mean_token_accuracy": 0.9912523052096367, + "num_tokens": 30643640.0, + "step": 29025 + }, + { + "entropy": 0.027401521958090598, + "epoch": 2.5617407791177054, + "grad_norm": 0.97265625, + "learning_rate": 1.0990940776453728e-05, + "loss": 0.0312890076637268, + "mean_token_accuracy": 0.9916872721910477, + "num_tokens": 30669460.0, + "step": 29050 + }, + { + "entropy": 0.02932647341722259, + "epoch": 2.56394541325867, + "grad_norm": 2.34375, + "learning_rate": 1.0882713441491653e-05, + "loss": 0.03259019613265991, + "mean_token_accuracy": 0.9919594395160675, + "num_tokens": 30695879.0, + "step": 29075 + }, + { + "entropy": 0.02201111441543617, + "epoch": 2.566150047399634, + "grad_norm": 1.0, + "learning_rate": 1.0774990930483354e-05, + "loss": 0.03443076848983764, + "mean_token_accuracy": 0.9926259750127793, + "num_tokens": 30721398.0, + "step": 29100 + }, + { + "entropy": 0.031130029212763474, + "epoch": 2.5683546815405984, + "grad_norm": 2.703125, + "learning_rate": 1.0667773853646034e-05, + "loss": 0.05192684173583984, + "mean_token_accuracy": 0.9869232523441315, + "num_tokens": 30750109.0, + "step": 29125 + }, + { + "entropy": 0.02571553880090505, + "epoch": 2.5705593156815625, + "grad_norm": 0.9453125, + "learning_rate": 1.0561062818333822e-05, + "loss": 0.029610633850097656, + "mean_token_accuracy": 0.9921117216348648, + "num_tokens": 30776603.0, + "step": 29150 + }, + { + "entropy": 0.026652264393324004, + "epoch": 2.572763949822527, + "grad_norm": 0.71875, + "learning_rate": 1.0454858429034187e-05, + "loss": 0.03440707683563232, + "mean_token_accuracy": 0.9905963116884231, + "num_tokens": 30802740.0, + "step": 29175 + }, + { + "entropy": 0.024566626491359784, + "epoch": 2.574968583963491, + "grad_norm": 1.015625, + "learning_rate": 1.0349161287364673e-05, + "loss": 0.028398797512054444, + "mean_token_accuracy": 0.992349998652935, + "num_tokens": 30829022.0, + "step": 29200 + }, + { + "entropy": 0.03593797193334467, + "epoch": 2.5771732181044555, + "grad_norm": 0.70703125, + "learning_rate": 1.0243971992069334e-05, + "loss": 0.05232769966125488, + "mean_token_accuracy": 0.9877561151981353, + "num_tokens": 30856036.0, + "step": 29225 + }, + { + "entropy": 0.030079898621734172, + "epoch": 2.57937785224542, + "grad_norm": 0.115234375, + "learning_rate": 1.0139291139015495e-05, + "loss": 0.03709140777587891, + "mean_token_accuracy": 0.9910090592503548, + "num_tokens": 30883425.0, + "step": 29250 + }, + { + "entropy": 0.029405801608154435, + "epoch": 2.581582486386384, + "grad_norm": 0.048095703125, + "learning_rate": 1.0035119321190233e-05, + "loss": 0.031949334144592285, + "mean_token_accuracy": 0.991183122098446, + "num_tokens": 30909134.0, + "step": 29275 + }, + { + "entropy": 0.028425286148940358, + "epoch": 2.5837871205273486, + "grad_norm": 1.6953125, + "learning_rate": 9.931457128697131e-06, + "loss": 0.0320594334602356, + "mean_token_accuracy": 0.9920493552088737, + "num_tokens": 30936305.0, + "step": 29300 + }, + { + "entropy": 0.027195464314481797, + "epoch": 2.5859917546683127, + "grad_norm": 0.625, + "learning_rate": 9.828305148752881e-06, + "loss": 0.032163431644439695, + "mean_token_accuracy": 0.9921726885437966, + "num_tokens": 30963031.0, + "step": 29325 + }, + { + "entropy": 0.027875148742787133, + "epoch": 2.588196388809277, + "grad_norm": 0.63671875, + "learning_rate": 9.725663965683984e-06, + "loss": 0.03857534646987915, + "mean_token_accuracy": 0.9911905533075333, + "num_tokens": 30990558.0, + "step": 29350 + }, + { + "entropy": 0.026536175619912682, + "epoch": 2.5904010229502417, + "grad_norm": 0.98046875, + "learning_rate": 9.623534160923353e-06, + "loss": 0.028181934356689455, + "mean_token_accuracy": 0.9915832805633545, + "num_tokens": 31016748.0, + "step": 29375 + }, + { + "entropy": 0.028007027512248896, + "epoch": 2.5926056570912057, + "grad_norm": 1.2578125, + "learning_rate": 9.521916313007173e-06, + "loss": 0.035958237648010254, + "mean_token_accuracy": 0.9915932086110115, + "num_tokens": 31044073.0, + "step": 29400 + }, + { + "entropy": 0.02840741300540685, + "epoch": 2.5948102912321698, + "grad_norm": 1.7109375, + "learning_rate": 9.420810997571516e-06, + "loss": 0.03402619361877442, + "mean_token_accuracy": 0.9930353647470475, + "num_tokens": 31070655.0, + "step": 29425 + }, + { + "entropy": 0.03006437236908823, + "epoch": 2.5970149253731343, + "grad_norm": 1.0, + "learning_rate": 9.320218787349066e-06, + "loss": 0.043330874443054196, + "mean_token_accuracy": 0.9889759740233421, + "num_tokens": 31097111.0, + "step": 29450 + }, + { + "entropy": 0.03026204977832094, + "epoch": 2.5992195595140988, + "grad_norm": 1.46875, + "learning_rate": 9.220140252165931e-06, + "loss": 0.03800657749176026, + "mean_token_accuracy": 0.9887870910763741, + "num_tokens": 31124102.0, + "step": 29475 + }, + { + "entropy": 0.028786359129080664, + "epoch": 2.601424193655063, + "grad_norm": 2.453125, + "learning_rate": 9.120575958938416e-06, + "loss": 0.02739781141281128, + "mean_token_accuracy": 0.9923294255137444, + "num_tokens": 31150596.0, + "step": 29500 + }, + { + "entropy": 0.02446969037204326, + "epoch": 2.6036288277960273, + "grad_norm": 1.546875, + "learning_rate": 9.021526471669783e-06, + "loss": 0.025637152194976805, + "mean_token_accuracy": 0.9926092553138733, + "num_tokens": 31177647.0, + "step": 29525 + }, + { + "entropy": 0.02096207481219608, + "epoch": 2.6058334619369914, + "grad_norm": 1.0078125, + "learning_rate": 8.922992351447079e-06, + "loss": 0.025728282928466798, + "mean_token_accuracy": 0.9925812685489654, + "num_tokens": 31202611.0, + "step": 29550 + }, + { + "entropy": 0.03069892433763016, + "epoch": 2.608038096077956, + "grad_norm": 1.0625, + "learning_rate": 8.824974156437903e-06, + "loss": 0.029011659622192383, + "mean_token_accuracy": 0.9920935586094857, + "num_tokens": 31227743.0, + "step": 29575 + }, + { + "entropy": 0.02603791271118098, + "epoch": 2.6102427302189204, + "grad_norm": 2.9375, + "learning_rate": 8.72747244188733e-06, + "loss": 0.031118543148040773, + "mean_token_accuracy": 0.9923553898930549, + "num_tokens": 31254607.0, + "step": 29600 + }, + { + "entropy": 0.02094233050909679, + "epoch": 2.6124473643598844, + "grad_norm": 0.84765625, + "learning_rate": 8.630487760114703e-06, + "loss": 0.020793261528015135, + "mean_token_accuracy": 0.9937394097447395, + "num_tokens": 31279142.0, + "step": 29625 + }, + { + "entropy": 0.027775917933613527, + "epoch": 2.614651998500849, + "grad_norm": 1.9453125, + "learning_rate": 8.5340206605105e-06, + "loss": 0.03821009874343872, + "mean_token_accuracy": 0.9896869486570359, + "num_tokens": 31304261.0, + "step": 29650 + }, + { + "entropy": 0.029270872360466455, + "epoch": 2.616856632641813, + "grad_norm": 0.208984375, + "learning_rate": 8.438071689533288e-06, + "loss": 0.03347602367401123, + "mean_token_accuracy": 0.9911593902111053, + "num_tokens": 31330018.0, + "step": 29675 + }, + { + "entropy": 0.02950705551649662, + "epoch": 2.6190612667827775, + "grad_norm": 0.291015625, + "learning_rate": 8.34264139070653e-06, + "loss": 0.046378650665283204, + "mean_token_accuracy": 0.9907181602716446, + "num_tokens": 31356650.0, + "step": 29700 + }, + { + "entropy": 0.029709462454957248, + "epoch": 2.6212659009237416, + "grad_norm": 1.2109375, + "learning_rate": 8.247730304615609e-06, + "loss": 0.04107787609100342, + "mean_token_accuracy": 0.9899954950809479, + "num_tokens": 31383748.0, + "step": 29725 + }, + { + "entropy": 0.031126942403589056, + "epoch": 2.623470535064706, + "grad_norm": 0.62109375, + "learning_rate": 8.153338968904723e-06, + "loss": 0.03897136926651001, + "mean_token_accuracy": 0.9887469747662544, + "num_tokens": 31410939.0, + "step": 29750 + }, + { + "entropy": 0.02810529593278261, + "epoch": 2.62567516920567, + "grad_norm": 0.69921875, + "learning_rate": 8.059467918273756e-06, + "loss": 0.02849973678588867, + "mean_token_accuracy": 0.9866181939840317, + "num_tokens": 31438643.0, + "step": 29775 + }, + { + "entropy": 0.027220633971937788, + "epoch": 2.6278798033466346, + "grad_norm": 1.2734375, + "learning_rate": 7.966117684475393e-06, + "loss": 0.039316484928131105, + "mean_token_accuracy": 0.9915398034453392, + "num_tokens": 31466576.0, + "step": 29800 + }, + { + "entropy": 0.02613406105301692, + "epoch": 2.630084437487599, + "grad_norm": 0.859375, + "learning_rate": 7.87328879631204e-06, + "loss": 0.027877295017242433, + "mean_token_accuracy": 0.9921690738201141, + "num_tokens": 31494426.0, + "step": 29825 + }, + { + "entropy": 0.029777123154854052, + "epoch": 2.632289071628563, + "grad_norm": 0.8671875, + "learning_rate": 7.780981779632823e-06, + "loss": 0.03954046487808228, + "mean_token_accuracy": 0.9905496680736542, + "num_tokens": 31520790.0, + "step": 29850 + }, + { + "entropy": 0.031225387039357885, + "epoch": 2.6344937057695277, + "grad_norm": 0.9609375, + "learning_rate": 7.689197157330564e-06, + "loss": 0.046720662117004395, + "mean_token_accuracy": 0.9887133800983429, + "num_tokens": 31546939.0, + "step": 29875 + }, + { + "entropy": 0.02381926384405233, + "epoch": 2.6366983399104917, + "grad_norm": 1.5234375, + "learning_rate": 7.597935449338944e-06, + "loss": 0.03087226867675781, + "mean_token_accuracy": 0.9920753428339958, + "num_tokens": 31572240.0, + "step": 29900 + }, + { + "entropy": 0.03250588172009884, + "epoch": 2.6389029740514562, + "grad_norm": 2.375, + "learning_rate": 7.507197172629432e-06, + "loss": 0.04254406452178955, + "mean_token_accuracy": 0.990644511282444, + "num_tokens": 31598840.0, + "step": 29925 + }, + { + "entropy": 0.031087516101815708, + "epoch": 2.6411076081924207, + "grad_norm": 1.09375, + "learning_rate": 7.416982841208453e-06, + "loss": 0.042113037109375, + "mean_token_accuracy": 0.987711393237114, + "num_tokens": 31626112.0, + "step": 29950 + }, + { + "entropy": 0.0267231866216207, + "epoch": 2.643312242333385, + "grad_norm": 0.55078125, + "learning_rate": 7.327292966114374e-06, + "loss": 0.026836345195770262, + "mean_token_accuracy": 0.9923613077402115, + "num_tokens": 31651914.0, + "step": 29975 + }, + { + "entropy": 0.026771614199224133, + "epoch": 2.645516876474349, + "grad_norm": 0.91796875, + "learning_rate": 7.238128055414706e-06, + "loss": 0.026218461990356445, + "mean_token_accuracy": 0.9917361649870873, + "num_tokens": 31678089.0, + "step": 30000 + }, + { + "epoch": 2.645516876474349, + "eval_entropy": 0.015163507781158242, + "eval_loss": 0.02418082021176815, + "eval_mean_token_accuracy": 0.9927960444442617, + "eval_num_tokens": 31678089.0, + "eval_runtime": 245.6766, + "eval_samples_per_second": 15.993, + "eval_steps_per_second": 4.001, + "step": 30000 + }, + { + "entropy": 0.030485192148407805, + "epoch": 2.6477215106153134, + "grad_norm": 2.734375, + "learning_rate": 7.1494886142031925e-06, + "loss": 0.03966634750366211, + "mean_token_accuracy": 0.9914444166421891, + "num_tokens": 31704822.0, + "step": 30025 + }, + { + "entropy": 0.02233901047420659, + "epoch": 2.649926144756278, + "grad_norm": 0.80078125, + "learning_rate": 7.0613751445969265e-06, + "loss": 0.023667454719543457, + "mean_token_accuracy": 0.993430680334568, + "num_tokens": 31730622.0, + "step": 30050 + }, + { + "entropy": 0.024482036822882948, + "epoch": 2.652130778897242, + "grad_norm": 0.67578125, + "learning_rate": 6.9737881457335506e-06, + "loss": 0.02852684736251831, + "mean_token_accuracy": 0.991863000690937, + "num_tokens": 31756387.0, + "step": 30075 + }, + { + "entropy": 0.027239963827469182, + "epoch": 2.6543354130382064, + "grad_norm": 1.0546875, + "learning_rate": 6.886728113768337e-06, + "loss": 0.03446184396743775, + "mean_token_accuracy": 0.9912598931789398, + "num_tokens": 31782098.0, + "step": 30100 + }, + { + "entropy": 0.02890480452515476, + "epoch": 2.6565400471791705, + "grad_norm": 1.0078125, + "learning_rate": 6.8001955418714905e-06, + "loss": 0.036803393363952636, + "mean_token_accuracy": 0.9865454795956612, + "num_tokens": 31810547.0, + "step": 30125 + }, + { + "entropy": 0.029436326650466072, + "epoch": 2.658744681320135, + "grad_norm": 1.0546875, + "learning_rate": 6.714190920225283e-06, + "loss": 0.03758874416351318, + "mean_token_accuracy": 0.9899130940437317, + "num_tokens": 31838640.0, + "step": 30150 + }, + { + "entropy": 0.027448534051000024, + "epoch": 2.6609493154610995, + "grad_norm": 1.71875, + "learning_rate": 6.628714736021358e-06, + "loss": 0.031783337593078616, + "mean_token_accuracy": 0.9929012563824654, + "num_tokens": 31863958.0, + "step": 30175 + }, + { + "entropy": 0.023979724997479933, + "epoch": 2.6631539496020635, + "grad_norm": 2.046875, + "learning_rate": 6.543767473457807e-06, + "loss": 0.026163532733917236, + "mean_token_accuracy": 0.9927327701449394, + "num_tokens": 31890174.0, + "step": 30200 + }, + { + "entropy": 0.028794102249594288, + "epoch": 2.6653585837430276, + "grad_norm": 0.470703125, + "learning_rate": 6.45934961373662e-06, + "loss": 0.03421214580535889, + "mean_token_accuracy": 0.9915524750947953, + "num_tokens": 31915944.0, + "step": 30225 + }, + { + "entropy": 0.0330623129722153, + "epoch": 2.667563217883992, + "grad_norm": 2.796875, + "learning_rate": 6.3754616350608424e-06, + "loss": 0.047150373458862305, + "mean_token_accuracy": 0.9889357024431229, + "num_tokens": 31942686.0, + "step": 30250 + }, + { + "entropy": 0.02424276659203315, + "epoch": 2.6697678520249566, + "grad_norm": 0.3671875, + "learning_rate": 6.292104012631905e-06, + "loss": 0.034151678085327146, + "mean_token_accuracy": 0.991973640024662, + "num_tokens": 31968763.0, + "step": 30275 + }, + { + "entropy": 0.022598919857009604, + "epoch": 2.6719724861659206, + "grad_norm": 0.345703125, + "learning_rate": 6.20927721864687e-06, + "loss": 0.024938690662384033, + "mean_token_accuracy": 0.9939743283390999, + "num_tokens": 31995639.0, + "step": 30300 + }, + { + "entropy": 0.027603573197920922, + "epoch": 2.674177120306885, + "grad_norm": 0.75390625, + "learning_rate": 6.1269817222958995e-06, + "loss": 0.0316735577583313, + "mean_token_accuracy": 0.9919213426113128, + "num_tokens": 32021121.0, + "step": 30325 + }, + { + "entropy": 0.029339179949456593, + "epoch": 2.676381754447849, + "grad_norm": 0.8515625, + "learning_rate": 6.045217989759411e-06, + "loss": 0.036246950626373294, + "mean_token_accuracy": 0.9915802147984505, + "num_tokens": 32048113.0, + "step": 30350 + }, + { + "entropy": 0.02590386390213098, + "epoch": 2.6785863885888137, + "grad_norm": 1.484375, + "learning_rate": 5.9639864842056084e-06, + "loss": 0.026949644088745117, + "mean_token_accuracy": 0.9931145197153092, + "num_tokens": 32074183.0, + "step": 30375 + }, + { + "entropy": 0.031569371115929244, + "epoch": 2.680791022729778, + "grad_norm": 1.5859375, + "learning_rate": 5.883287665787729e-06, + "loss": 0.03619576454162598, + "mean_token_accuracy": 0.9898512217402459, + "num_tokens": 32100552.0, + "step": 30400 + }, + { + "entropy": 0.03282993205477396, + "epoch": 2.6829956568707423, + "grad_norm": 3.84375, + "learning_rate": 5.803121991641513e-06, + "loss": 0.041049656867980955, + "mean_token_accuracy": 0.9897631332278252, + "num_tokens": 32127816.0, + "step": 30425 + }, + { + "entropy": 0.030889862734293273, + "epoch": 2.6852002910117068, + "grad_norm": 0.76953125, + "learning_rate": 5.723489915882607e-06, + "loss": 0.029335715770721436, + "mean_token_accuracy": 0.992390308380127, + "num_tokens": 32155551.0, + "step": 30450 + }, + { + "entropy": 0.030821244430044317, + "epoch": 2.687404925152671, + "grad_norm": 1.4453125, + "learning_rate": 5.644391889603951e-06, + "loss": 0.03456997871398926, + "mean_token_accuracy": 0.9896837142109871, + "num_tokens": 32182672.0, + "step": 30475 + }, + { + "entropy": 0.022968131275520137, + "epoch": 2.6896095592936353, + "grad_norm": 1.0078125, + "learning_rate": 5.565828360873226e-06, + "loss": 0.03449763298034668, + "mean_token_accuracy": 0.9907573989033699, + "num_tokens": 32210056.0, + "step": 30500 + }, + { + "entropy": 0.021756964745400183, + "epoch": 2.6918141934345994, + "grad_norm": 0.80078125, + "learning_rate": 5.487799774730395e-06, + "loss": 0.02377518653869629, + "mean_token_accuracy": 0.9940974757075309, + "num_tokens": 32235991.0, + "step": 30525 + }, + { + "entropy": 0.02951632114707536, + "epoch": 2.694018827575564, + "grad_norm": 1.734375, + "learning_rate": 5.410306573185109e-06, + "loss": 0.03300446033477783, + "mean_token_accuracy": 0.9902588561177254, + "num_tokens": 32262850.0, + "step": 30550 + }, + { + "entropy": 0.02724777233459463, + "epoch": 2.696223461716528, + "grad_norm": 0.65234375, + "learning_rate": 5.333349195214166e-06, + "loss": 0.03714723348617554, + "mean_token_accuracy": 0.9866947430372238, + "num_tokens": 32290242.0, + "step": 30575 + }, + { + "entropy": 0.02796483456815622, + "epoch": 2.6984280958574924, + "grad_norm": 0.16015625, + "learning_rate": 5.256928076759115e-06, + "loss": 0.033420143127441404, + "mean_token_accuracy": 0.9920667752623558, + "num_tokens": 32315628.0, + "step": 30600 + }, + { + "entropy": 0.02718132355395937, + "epoch": 2.700632729998457, + "grad_norm": 1.21875, + "learning_rate": 5.181043650723761e-06, + "loss": 0.03775918960571289, + "mean_token_accuracy": 0.990137320458889, + "num_tokens": 32341921.0, + "step": 30625 + }, + { + "entropy": 0.02674400442670958, + "epoch": 2.702837364139421, + "grad_norm": 0.58984375, + "learning_rate": 5.105696346971678e-06, + "loss": 0.032031383514404294, + "mean_token_accuracy": 0.9912699130177498, + "num_tokens": 32368912.0, + "step": 30650 + }, + { + "entropy": 0.03207970959640079, + "epoch": 2.7050419982803855, + "grad_norm": 0.96484375, + "learning_rate": 5.030886592323825e-06, + "loss": 0.03718382120132446, + "mean_token_accuracy": 0.9903065833449364, + "num_tokens": 32396094.0, + "step": 30675 + }, + { + "entropy": 0.031730443536853274, + "epoch": 2.7072466324213496, + "grad_norm": 0.55859375, + "learning_rate": 4.956614810556038e-06, + "loss": 0.04166868686676026, + "mean_token_accuracy": 0.9892936205863953, + "num_tokens": 32422076.0, + "step": 30700 + }, + { + "entropy": 0.029964850165561073, + "epoch": 2.709451266562314, + "grad_norm": 0.69921875, + "learning_rate": 4.882881422396768e-06, + "loss": 0.02965547323226929, + "mean_token_accuracy": 0.992727922797203, + "num_tokens": 32448492.0, + "step": 30725 + }, + { + "entropy": 0.025802147214999422, + "epoch": 2.7116559007032786, + "grad_norm": 0.73828125, + "learning_rate": 4.809686845524553e-06, + "loss": 0.028469092845916748, + "mean_token_accuracy": 0.986219149529934, + "num_tokens": 32475202.0, + "step": 30750 + }, + { + "entropy": 0.02441559437660544, + "epoch": 2.7138605348442426, + "grad_norm": 0.54296875, + "learning_rate": 4.73703149456578e-06, + "loss": 0.024286515712738037, + "mean_token_accuracy": 0.9930135196447373, + "num_tokens": 32500990.0, + "step": 30775 + }, + { + "entropy": 0.02470657176872919, + "epoch": 2.7160651689852067, + "grad_norm": 0.125, + "learning_rate": 4.664915781092227e-06, + "loss": 0.027780747413635253, + "mean_token_accuracy": 0.9930829498171806, + "num_tokens": 32527121.0, + "step": 30800 + }, + { + "entropy": 0.02191462902490457, + "epoch": 2.718269803126171, + "grad_norm": 2.015625, + "learning_rate": 4.593340113618783e-06, + "loss": 0.028052380084991457, + "mean_token_accuracy": 0.9934657236933708, + "num_tokens": 32554159.0, + "step": 30825 + }, + { + "entropy": 0.030870509027117805, + "epoch": 2.7204744372671357, + "grad_norm": 1.0546875, + "learning_rate": 4.522304897601149e-06, + "loss": 0.03652215957641602, + "mean_token_accuracy": 0.9900946572422982, + "num_tokens": 32582393.0, + "step": 30850 + }, + { + "entropy": 0.02534176224733528, + "epoch": 2.7226790714080997, + "grad_norm": 1.8359375, + "learning_rate": 4.451810535433532e-06, + "loss": 0.034189610481262206, + "mean_token_accuracy": 0.9922301995754242, + "num_tokens": 32608359.0, + "step": 30875 + }, + { + "entropy": 0.022799009535629012, + "epoch": 2.7248837055490642, + "grad_norm": 0.875, + "learning_rate": 4.381857426446323e-06, + "loss": 0.0269643759727478, + "mean_token_accuracy": 0.992847872376442, + "num_tokens": 32633993.0, + "step": 30900 + }, + { + "entropy": 0.025321687431678584, + "epoch": 2.7270883396900283, + "grad_norm": 1.5625, + "learning_rate": 4.312445966903911e-06, + "loss": 0.033355269432067874, + "mean_token_accuracy": 0.9916208279132843, + "num_tokens": 32660746.0, + "step": 30925 + }, + { + "entropy": 0.02630714648788853, + "epoch": 2.729292973830993, + "grad_norm": 1.28125, + "learning_rate": 4.243576550002348e-06, + "loss": 0.03572848796844483, + "mean_token_accuracy": 0.9908462983369827, + "num_tokens": 32687066.0, + "step": 30950 + }, + { + "entropy": 0.02801416463107671, + "epoch": 2.7314976079719573, + "grad_norm": 1.9765625, + "learning_rate": 4.175249565867212e-06, + "loss": 0.03111718416213989, + "mean_token_accuracy": 0.9914992889761924, + "num_tokens": 32714187.0, + "step": 30975 + }, + { + "entropy": 0.025514190020621753, + "epoch": 2.7337022421129213, + "grad_norm": 1.65625, + "learning_rate": 4.107465401551347e-06, + "loss": 0.02301668405532837, + "mean_token_accuracy": 0.9932084521651268, + "num_tokens": 32739302.0, + "step": 31000 + }, + { + "epoch": 2.7337022421129213, + "eval_entropy": 0.015138276679110206, + "eval_loss": 0.024185428395867348, + "eval_mean_token_accuracy": 0.9928067592128975, + "eval_num_tokens": 32739302.0, + "eval_runtime": 243.8232, + "eval_samples_per_second": 16.114, + "eval_steps_per_second": 4.032, + "step": 31000 + }, + { + "entropy": 0.02361316845028341, + "epoch": 2.7359068762538854, + "grad_norm": 1.0234375, + "learning_rate": 4.040224441032647e-06, + "loss": 0.02711350440979004, + "mean_token_accuracy": 0.9928304886817932, + "num_tokens": 32764350.0, + "step": 31025 + }, + { + "entropy": 0.0276046938167201, + "epoch": 2.73811151039485, + "grad_norm": 2.421875, + "learning_rate": 3.973527065211924e-06, + "loss": 0.034292078018188475, + "mean_token_accuracy": 0.9921803194284439, + "num_tokens": 32790544.0, + "step": 31050 + }, + { + "entropy": 0.02835739084504894, + "epoch": 2.7403161445358144, + "grad_norm": 0.796875, + "learning_rate": 3.9073736519107615e-06, + "loss": 0.030812058448791504, + "mean_token_accuracy": 0.9924408486485481, + "num_tokens": 32817371.0, + "step": 31075 + }, + { + "entropy": 0.026140070521105372, + "epoch": 2.7425207786767785, + "grad_norm": 0.287109375, + "learning_rate": 3.841764575869356e-06, + "loss": 0.03151321649551392, + "mean_token_accuracy": 0.9925504148006439, + "num_tokens": 32844316.0, + "step": 31100 + }, + { + "entropy": 0.021448419220723734, + "epoch": 2.744725412817743, + "grad_norm": 1.6796875, + "learning_rate": 3.7767002087443214e-06, + "loss": 0.027071454524993897, + "mean_token_accuracy": 0.9938805449008942, + "num_tokens": 32869921.0, + "step": 31125 + }, + { + "entropy": 0.027382947859623526, + "epoch": 2.746930046958707, + "grad_norm": 2.03125, + "learning_rate": 3.7121809191067225e-06, + "loss": 0.034256911277770995, + "mean_token_accuracy": 0.9918771433830261, + "num_tokens": 32895789.0, + "step": 31150 + }, + { + "entropy": 0.025478750003985626, + "epoch": 2.7491346810996715, + "grad_norm": 2.65625, + "learning_rate": 3.6482070724399022e-06, + "loss": 0.036122303009033206, + "mean_token_accuracy": 0.9917540955543518, + "num_tokens": 32921732.0, + "step": 31175 + }, + { + "entropy": 0.030465085982468734, + "epoch": 2.751339315240636, + "grad_norm": 0.265625, + "learning_rate": 3.5847790311374085e-06, + "loss": 0.035727314949035645, + "mean_token_accuracy": 0.9871862959861756, + "num_tokens": 32949378.0, + "step": 31200 + }, + { + "entropy": 0.02002370489099121, + "epoch": 2.7535439493816, + "grad_norm": 1.0546875, + "learning_rate": 3.521897154500953e-06, + "loss": 0.016289963722229003, + "mean_token_accuracy": 0.9956188550591469, + "num_tokens": 32975624.0, + "step": 31225 + }, + { + "entropy": 0.030022653135565635, + "epoch": 2.7557485835225646, + "grad_norm": 1.390625, + "learning_rate": 3.4595617987384086e-06, + "loss": 0.034364793300628665, + "mean_token_accuracy": 0.9914705500006675, + "num_tokens": 33002241.0, + "step": 31250 + }, + { + "entropy": 0.02831969540686259, + "epoch": 2.7579532176635286, + "grad_norm": 0.271484375, + "learning_rate": 3.3977733169617276e-06, + "loss": 0.03397629976272583, + "mean_token_accuracy": 0.9931815955042839, + "num_tokens": 33029769.0, + "step": 31275 + }, + { + "entropy": 0.024141516995041457, + "epoch": 2.760157851804493, + "grad_norm": 0.55078125, + "learning_rate": 3.336532059185016e-06, + "loss": 0.020577189922332765, + "mean_token_accuracy": 0.9933001267910003, + "num_tokens": 33054886.0, + "step": 31300 + }, + { + "entropy": 0.028197366523891106, + "epoch": 2.762362485945457, + "grad_norm": 3.171875, + "learning_rate": 3.275838372322482e-06, + "loss": 0.0250828218460083, + "mean_token_accuracy": 0.99329480946064, + "num_tokens": 33082486.0, + "step": 31325 + }, + { + "entropy": 0.025448178426740922, + "epoch": 2.7645671200864217, + "grad_norm": 0.271484375, + "learning_rate": 3.215692600186504e-06, + "loss": 0.02443751573562622, + "mean_token_accuracy": 0.9934353199601174, + "num_tokens": 33108507.0, + "step": 31350 + }, + { + "entropy": 0.024398964844294824, + "epoch": 2.7667717542273857, + "grad_norm": 0.63671875, + "learning_rate": 3.1560950834856995e-06, + "loss": 0.023592844009399414, + "mean_token_accuracy": 0.9932548335194588, + "num_tokens": 33135112.0, + "step": 31375 + }, + { + "entropy": 0.02761033813996619, + "epoch": 2.7689763883683502, + "grad_norm": 2.453125, + "learning_rate": 3.0970461598229806e-06, + "loss": 0.030847842693328856, + "mean_token_accuracy": 0.9923774653673172, + "num_tokens": 33161778.0, + "step": 31400 + }, + { + "entropy": 0.023629968942695995, + "epoch": 2.7711810225093148, + "grad_norm": 0.79296875, + "learning_rate": 3.0385461636935896e-06, + "loss": 0.02890580177307129, + "mean_token_accuracy": 0.9930263885855675, + "num_tokens": 33187952.0, + "step": 31425 + }, + { + "entropy": 0.031088588998973137, + "epoch": 2.773385656650279, + "grad_norm": 0.05126953125, + "learning_rate": 2.9805954264832903e-06, + "loss": 0.039121434688568116, + "mean_token_accuracy": 0.990178719162941, + "num_tokens": 33214650.0, + "step": 31450 + }, + { + "entropy": 0.025570228301403403, + "epoch": 2.7755902907912433, + "grad_norm": 1.96875, + "learning_rate": 2.9231942764664566e-06, + "loss": 0.034187076091766355, + "mean_token_accuracy": 0.9918478041887283, + "num_tokens": 33241014.0, + "step": 31475 + }, + { + "entropy": 0.027400689849127958, + "epoch": 2.7777949249322074, + "grad_norm": 1.0, + "learning_rate": 2.8663430388041977e-06, + "loss": 0.027099764347076415, + "mean_token_accuracy": 0.9917295035719872, + "num_tokens": 33268017.0, + "step": 31500 + }, + { + "entropy": 0.03041357980109751, + "epoch": 2.779999559073172, + "grad_norm": 0.1142578125, + "learning_rate": 2.8100420355424927e-06, + "loss": 0.03652872085571289, + "mean_token_accuracy": 0.9906187650561332, + "num_tokens": 33295932.0, + "step": 31525 + }, + { + "entropy": 0.023346892858407953, + "epoch": 2.7822041932141364, + "grad_norm": 1.2890625, + "learning_rate": 2.754291585610458e-06, + "loss": 0.02764610290527344, + "mean_token_accuracy": 0.9924876546859741, + "num_tokens": 33321774.0, + "step": 31550 + }, + { + "entropy": 0.027840503481929773, + "epoch": 2.7844088273551004, + "grad_norm": 0.1171875, + "learning_rate": 2.699092004818449e-06, + "loss": 0.03487974882125854, + "mean_token_accuracy": 0.9911468136310577, + "num_tokens": 33347870.0, + "step": 31575 + }, + { + "entropy": 0.02395780631508387, + "epoch": 2.7866134614960645, + "grad_norm": 0.3984375, + "learning_rate": 2.6444436058563284e-06, + "loss": 0.030053725242614748, + "mean_token_accuracy": 0.9926697811484337, + "num_tokens": 33373842.0, + "step": 31600 + }, + { + "entropy": 0.03379408556993439, + "epoch": 2.788818095637029, + "grad_norm": 1.34375, + "learning_rate": 2.5903466982916235e-06, + "loss": 0.040092172622680666, + "mean_token_accuracy": 0.9891455551981926, + "num_tokens": 33400897.0, + "step": 31625 + }, + { + "entropy": 0.026423628384582117, + "epoch": 2.7910227297779935, + "grad_norm": 0.54296875, + "learning_rate": 2.536801588567861e-06, + "loss": 0.030896174907684325, + "mean_token_accuracy": 0.9924877586960793, + "num_tokens": 33427390.0, + "step": 31650 + }, + { + "entropy": 0.025224947988645, + "epoch": 2.7932273639189575, + "grad_norm": 0.384765625, + "learning_rate": 2.4838085800028e-06, + "loss": 0.030030651092529295, + "mean_token_accuracy": 0.9931579613685608, + "num_tokens": 33452449.0, + "step": 31675 + }, + { + "entropy": 0.02849430440346623, + "epoch": 2.795431998059922, + "grad_norm": 2.15625, + "learning_rate": 2.431367972786669e-06, + "loss": 0.03065213918685913, + "mean_token_accuracy": 0.9911214289069176, + "num_tokens": 33480216.0, + "step": 31700 + }, + { + "entropy": 0.023187637610408273, + "epoch": 2.797636632200886, + "grad_norm": 1.0546875, + "learning_rate": 2.3794800639805326e-06, + "loss": 0.028720135688781737, + "mean_token_accuracy": 0.9924741345643997, + "num_tokens": 33505655.0, + "step": 31725 + }, + { + "entropy": 0.028415085796841596, + "epoch": 2.7998412663418506, + "grad_norm": 1.6484375, + "learning_rate": 2.3281451475145265e-06, + "loss": 0.035433664321899414, + "mean_token_accuracy": 0.9908634713292122, + "num_tokens": 33533126.0, + "step": 31750 + }, + { + "entropy": 0.024498703280369228, + "epoch": 2.802045900482815, + "grad_norm": 0.67578125, + "learning_rate": 2.2773635141863146e-06, + "loss": 0.034044148921966555, + "mean_token_accuracy": 0.9911482638120651, + "num_tokens": 33560293.0, + "step": 31775 + }, + { + "entropy": 0.022807922580341256, + "epoch": 2.804250534623779, + "grad_norm": 0.66796875, + "learning_rate": 2.2271354516593345e-06, + "loss": 0.026914541721343995, + "mean_token_accuracy": 0.9931732827425003, + "num_tokens": 33586912.0, + "step": 31800 + }, + { + "entropy": 0.028503268719650806, + "epoch": 2.8064551687647437, + "grad_norm": 1.703125, + "learning_rate": 2.1774612444611763e-06, + "loss": 0.0329655909538269, + "mean_token_accuracy": 0.9907631108164787, + "num_tokens": 33613367.0, + "step": 31825 + }, + { + "entropy": 0.03049155484219227, + "epoch": 2.8086598029057077, + "grad_norm": 0.8515625, + "learning_rate": 2.128341173982029e-06, + "loss": 0.03732057809829712, + "mean_token_accuracy": 0.9890658557415009, + "num_tokens": 33640305.0, + "step": 31850 + }, + { + "entropy": 0.023213110088054236, + "epoch": 2.810864437046672, + "grad_norm": 0.71875, + "learning_rate": 2.0797755184730703e-06, + "loss": 0.02541722059249878, + "mean_token_accuracy": 0.9929253304004669, + "num_tokens": 33665942.0, + "step": 31875 + }, + { + "entropy": 0.029790412700567686, + "epoch": 2.8130690711876363, + "grad_norm": 1.34375, + "learning_rate": 2.031764553044846e-06, + "loss": 0.033856201171875, + "mean_token_accuracy": 0.9906247487664223, + "num_tokens": 33693648.0, + "step": 31900 + }, + { + "entropy": 0.025559156673880354, + "epoch": 2.8152737053286008, + "grad_norm": 0.2041015625, + "learning_rate": 1.9843085496657363e-06, + "loss": 0.024002790451049805, + "mean_token_accuracy": 0.9935007789731025, + "num_tokens": 33719559.0, + "step": 31925 + }, + { + "entropy": 0.030051562321277744, + "epoch": 2.817478339469565, + "grad_norm": 1.1484375, + "learning_rate": 1.9374077771604717e-06, + "loss": 0.034890377521514894, + "mean_token_accuracy": 0.9901774021983146, + "num_tokens": 33745656.0, + "step": 31950 + }, + { + "entropy": 0.029569217341013428, + "epoch": 2.8196829736105293, + "grad_norm": 0.7578125, + "learning_rate": 1.8910625012084849e-06, + "loss": 0.045575013160705564, + "mean_token_accuracy": 0.9892233127355575, + "num_tokens": 33773069.0, + "step": 31975 + }, + { + "entropy": 0.0297723802717519, + "epoch": 2.821887607751494, + "grad_norm": 0.3984375, + "learning_rate": 1.8452729843425166e-06, + "loss": 0.031798024177551266, + "mean_token_accuracy": 0.9912262725830078, + "num_tokens": 33799323.0, + "step": 32000 + }, + { + "epoch": 2.821887607751494, + "eval_entropy": 0.01515091881442565, + "eval_loss": 0.024167899042367935, + "eval_mean_token_accuracy": 0.9927796902758422, + "eval_num_tokens": 33799323.0, + "eval_runtime": 224.6709, + "eval_samples_per_second": 17.488, + "eval_steps_per_second": 4.375, + "step": 32000 + }, + { + "entropy": 0.03310864323779242, + "epoch": 2.824092241892458, + "grad_norm": 1.5703125, + "learning_rate": 1.8000394859471248e-06, + "loss": 0.03980295896530151, + "mean_token_accuracy": 0.9912154677510262, + "num_tokens": 33826782.0, + "step": 32025 + }, + { + "entropy": 0.025057866033166648, + "epoch": 2.8262968760334224, + "grad_norm": 1.2109375, + "learning_rate": 1.7553622622571097e-06, + "loss": 0.027699394226074217, + "mean_token_accuracy": 0.9926194402575493, + "num_tokens": 33854056.0, + "step": 32050 + }, + { + "entropy": 0.02458085154845321, + "epoch": 2.8285015101743864, + "grad_norm": 0.18359375, + "learning_rate": 1.7112415663562032e-06, + "loss": 0.032873761653900144, + "mean_token_accuracy": 0.9916874733567238, + "num_tokens": 33879518.0, + "step": 32075 + }, + { + "entropy": 0.03164592763743713, + "epoch": 2.830706144315351, + "grad_norm": 0.90625, + "learning_rate": 1.667677648175503e-06, + "loss": 0.03423006057739258, + "mean_token_accuracy": 0.9900494894385338, + "num_tokens": 33906134.0, + "step": 32100 + }, + { + "entropy": 0.02424717899073585, + "epoch": 2.8329107784563154, + "grad_norm": 1.65625, + "learning_rate": 1.624670754492197e-06, + "loss": 0.027348561286926268, + "mean_token_accuracy": 0.9924674391746521, + "num_tokens": 33931392.0, + "step": 32125 + }, + { + "entropy": 0.024996388291965557, + "epoch": 2.8351154125972795, + "grad_norm": 2.28125, + "learning_rate": 1.5822211289280077e-06, + "loss": 0.027897491455078124, + "mean_token_accuracy": 0.9916456484794617, + "num_tokens": 33957012.0, + "step": 32150 + }, + { + "entropy": 0.02605572233511339, + "epoch": 2.8373200467382436, + "grad_norm": 2.5625, + "learning_rate": 1.5403290119479275e-06, + "loss": 0.0301747727394104, + "mean_token_accuracy": 0.9904224568605423, + "num_tokens": 33982679.0, + "step": 32175 + }, + { + "entropy": 0.025316978948030738, + "epoch": 2.839524680879208, + "grad_norm": 1.03125, + "learning_rate": 1.4989946408588419e-06, + "loss": 0.03385440587997437, + "mean_token_accuracy": 0.9931662768125534, + "num_tokens": 34009263.0, + "step": 32200 + }, + { + "entropy": 0.02666074657916397, + "epoch": 2.8417293150201726, + "grad_norm": 2.46875, + "learning_rate": 1.4582182498081521e-06, + "loss": 0.026826119422912596, + "mean_token_accuracy": 0.9928840225934983, + "num_tokens": 34035632.0, + "step": 32225 + }, + { + "entropy": 0.019749616278095346, + "epoch": 2.8439339491611366, + "grad_norm": 0.09521484375, + "learning_rate": 1.4180000697824437e-06, + "loss": 0.029020504951477052, + "mean_token_accuracy": 0.9939252683520317, + "num_tokens": 34059537.0, + "step": 32250 + }, + { + "entropy": 0.025551201240523368, + "epoch": 2.846138583302101, + "grad_norm": 1.65625, + "learning_rate": 1.378340328606198e-06, + "loss": 0.02985560417175293, + "mean_token_accuracy": 0.9924237194657326, + "num_tokens": 34085547.0, + "step": 32275 + }, + { + "entropy": 0.02937892486921555, + "epoch": 2.848343217443065, + "grad_norm": 1.9921875, + "learning_rate": 1.3392392509405383e-06, + "loss": 0.03745560646057129, + "mean_token_accuracy": 0.9910514345765113, + "num_tokens": 34111234.0, + "step": 32300 + }, + { + "entropy": 0.0265326060807638, + "epoch": 2.8505478515840297, + "grad_norm": 0.0986328125, + "learning_rate": 1.3006970582818856e-06, + "loss": 0.03232823848724365, + "mean_token_accuracy": 0.9928913420438766, + "num_tokens": 34138146.0, + "step": 32325 + }, + { + "entropy": 0.02657694019340852, + "epoch": 2.852752485724994, + "grad_norm": 1.5703125, + "learning_rate": 1.262713968960738e-06, + "loss": 0.032440063953399656, + "mean_token_accuracy": 0.9916506347060203, + "num_tokens": 34163976.0, + "step": 32350 + }, + { + "entropy": 0.02390991311860489, + "epoch": 2.8549571198659582, + "grad_norm": 0.1865234375, + "learning_rate": 1.2252901981404384e-06, + "loss": 0.028550803661346436, + "mean_token_accuracy": 0.9930156728625298, + "num_tokens": 34189547.0, + "step": 32375 + }, + { + "entropy": 0.03081292722563376, + "epoch": 2.8571617540069223, + "grad_norm": 1.7578125, + "learning_rate": 1.188425957815953e-06, + "loss": 0.040956239700317386, + "mean_token_accuracy": 0.99051443785429, + "num_tokens": 34216591.0, + "step": 32400 + }, + { + "entropy": 0.025143344738498854, + "epoch": 2.859366388147887, + "grad_norm": 0.5703125, + "learning_rate": 1.1521214568126714e-06, + "loss": 0.026441171169281005, + "mean_token_accuracy": 0.9920197981595993, + "num_tokens": 34243141.0, + "step": 32425 + }, + { + "entropy": 0.02643716373735515, + "epoch": 2.8615710222888513, + "grad_norm": 0.78125, + "learning_rate": 1.1163769007851988e-06, + "loss": 0.02717090606689453, + "mean_token_accuracy": 0.9935029128193855, + "num_tokens": 34270532.0, + "step": 32450 + }, + { + "entropy": 0.02689198448573734, + "epoch": 2.8637756564298154, + "grad_norm": 0.59765625, + "learning_rate": 1.081192492216243e-06, + "loss": 0.02757613182067871, + "mean_token_accuracy": 0.9925964233279229, + "num_tokens": 34297695.0, + "step": 32475 + }, + { + "entropy": 0.027454499909836158, + "epoch": 2.86598029057078, + "grad_norm": 0.65234375, + "learning_rate": 1.0465684304154067e-06, + "loss": 0.0293727707862854, + "mean_token_accuracy": 0.9919403794407845, + "num_tokens": 34324699.0, + "step": 32500 + }, + { + "entropy": 0.03476064759888686, + "epoch": 2.868184924711744, + "grad_norm": 2.3125, + "learning_rate": 1.0125049115181196e-06, + "loss": 0.0358671760559082, + "mean_token_accuracy": 0.9892446520924568, + "num_tokens": 34353069.0, + "step": 32525 + }, + { + "entropy": 0.028821960146378844, + "epoch": 2.8703895588527084, + "grad_norm": 3.25, + "learning_rate": 9.79002128484463e-07, + "loss": 0.03103492021560669, + "mean_token_accuracy": 0.99189660936594, + "num_tokens": 34378573.0, + "step": 32550 + }, + { + "entropy": 0.03503867352499583, + "epoch": 2.872594192993673, + "grad_norm": 0.8125, + "learning_rate": 9.460602710981259e-07, + "loss": 0.04088228225708008, + "mean_token_accuracy": 0.9876779773831368, + "num_tokens": 34404264.0, + "step": 32575 + }, + { + "entropy": 0.026292509975028223, + "epoch": 2.874798827134637, + "grad_norm": 0.302734375, + "learning_rate": 9.136795259653386e-07, + "loss": 0.027711949348449706, + "mean_token_accuracy": 0.9927157282829284, + "num_tokens": 34429879.0, + "step": 32600 + }, + { + "entropy": 0.032493346882838524, + "epoch": 2.8770034612756015, + "grad_norm": 1.4375, + "learning_rate": 8.818600765137408e-07, + "loss": 0.03947168827056885, + "mean_token_accuracy": 0.9898266091942787, + "num_tokens": 34457557.0, + "step": 32625 + }, + { + "entropy": 0.03031579275168042, + "epoch": 2.8792080954165655, + "grad_norm": 0.384765625, + "learning_rate": 8.506021029914157e-07, + "loss": 0.034112751483917236, + "mean_token_accuracy": 0.990267367362976, + "num_tokens": 34484320.0, + "step": 32650 + }, + { + "entropy": 0.025231020506616915, + "epoch": 2.88141272955753, + "grad_norm": 1.328125, + "learning_rate": 8.199057824658574e-07, + "loss": 0.02583768844604492, + "mean_token_accuracy": 0.9929495406150818, + "num_tokens": 34509823.0, + "step": 32675 + }, + { + "entropy": 0.028853209191984207, + "epoch": 2.883617363698494, + "grad_norm": 1.8515625, + "learning_rate": 7.897712888229491e-07, + "loss": 0.0317841649055481, + "mean_token_accuracy": 0.9915751847624779, + "num_tokens": 34536481.0, + "step": 32700 + }, + { + "entropy": 0.03166955886284995, + "epoch": 2.8858219978394586, + "grad_norm": 0.62109375, + "learning_rate": 7.60198792765987e-07, + "loss": 0.039300172328948973, + "mean_token_accuracy": 0.9904856544733047, + "num_tokens": 34563378.0, + "step": 32725 + }, + { + "entropy": 0.023482872382519417, + "epoch": 2.8880266319804226, + "grad_norm": 0.404296875, + "learning_rate": 7.311884618147025e-07, + "loss": 0.03237533330917358, + "mean_token_accuracy": 0.9908891406655311, + "num_tokens": 34588598.0, + "step": 32750 + }, + { + "entropy": 0.025234425218295654, + "epoch": 2.890231266121387, + "grad_norm": 1.203125, + "learning_rate": 7.027404603043186e-07, + "loss": 0.030891218185424806, + "mean_token_accuracy": 0.9913925004005432, + "num_tokens": 34615337.0, + "step": 32775 + }, + { + "entropy": 0.027402053751684433, + "epoch": 2.8924359002623516, + "grad_norm": 0.123046875, + "learning_rate": 6.748549493846513e-07, + "loss": 0.04195026397705078, + "mean_token_accuracy": 0.9898461005091668, + "num_tokens": 34642678.0, + "step": 32800 + }, + { + "entropy": 0.023586020493894466, + "epoch": 2.8946405344033157, + "grad_norm": 2.53125, + "learning_rate": 6.475320870191315e-07, + "loss": 0.026862666606903077, + "mean_token_accuracy": 0.991207799911499, + "num_tokens": 34667892.0, + "step": 32825 + }, + { + "entropy": 0.025827234457919984, + "epoch": 2.89684516854428, + "grad_norm": 2.359375, + "learning_rate": 6.207720279839735e-07, + "loss": 0.03438200950622559, + "mean_token_accuracy": 0.9910271978378296, + "num_tokens": 34694059.0, + "step": 32850 + }, + { + "entropy": 0.02829796871297731, + "epoch": 2.8990498026852443, + "grad_norm": 0.1923828125, + "learning_rate": 5.945749238672527e-07, + "loss": 0.031097068786621093, + "mean_token_accuracy": 0.9911362925171852, + "num_tokens": 34719150.0, + "step": 32875 + }, + { + "entropy": 0.024490521577972685, + "epoch": 2.9012544368262088, + "grad_norm": 0.2578125, + "learning_rate": 5.689409230680843e-07, + "loss": 0.027147409915924074, + "mean_token_accuracy": 0.9921639716625213, + "num_tokens": 34744016.0, + "step": 32900 + }, + { + "entropy": 0.029058779828665136, + "epoch": 2.9034590709671733, + "grad_norm": 0.671875, + "learning_rate": 5.438701707957572e-07, + "loss": 0.03907189846038819, + "mean_token_accuracy": 0.9910078606009484, + "num_tokens": 34771166.0, + "step": 32925 + }, + { + "entropy": 0.02809664033673471, + "epoch": 2.9056637051081373, + "grad_norm": 1.203125, + "learning_rate": 5.193628090689018e-07, + "loss": 0.03171214580535889, + "mean_token_accuracy": 0.9920984748005867, + "num_tokens": 34796092.0, + "step": 32950 + }, + { + "entropy": 0.024929613195490676, + "epoch": 2.9078683392491014, + "grad_norm": 0.2236328125, + "learning_rate": 4.954189767147344e-07, + "loss": 0.029733285903930665, + "mean_token_accuracy": 0.9918975239992142, + "num_tokens": 34822238.0, + "step": 32975 + }, + { + "entropy": 0.02411046184010047, + "epoch": 2.910072973390066, + "grad_norm": 0.1787109375, + "learning_rate": 4.7203880936821375e-07, + "loss": 0.03335120916366577, + "mean_token_accuracy": 0.9924368995428086, + "num_tokens": 34848162.0, + "step": 33000 + }, + { + "epoch": 2.910072973390066, + "eval_entropy": 0.01513221942192457, + "eval_loss": 0.02417258359491825, + "eval_mean_token_accuracy": 0.9928157675787731, + "eval_num_tokens": 34848162.0, + "eval_runtime": 230.2757, + "eval_samples_per_second": 17.062, + "eval_steps_per_second": 4.269, + "step": 33000 + }, + { + "entropy": 0.028706058168027085, + "epoch": 2.9122776075310304, + "grad_norm": 3.40625, + "learning_rate": 4.492224394712974e-07, + "loss": 0.04000510692596435, + "mean_token_accuracy": 0.9895797765254974, + "num_tokens": 34874800.0, + "step": 33025 + }, + { + "entropy": 0.0267985630390649, + "epoch": 2.9144822416719944, + "grad_norm": 2.984375, + "learning_rate": 4.2696999627221956e-07, + "loss": 0.03146414995193481, + "mean_token_accuracy": 0.9921754291653633, + "num_tokens": 34900617.0, + "step": 33050 + }, + { + "entropy": 0.030926660760123924, + "epoch": 2.916686875812959, + "grad_norm": 1.609375, + "learning_rate": 4.052816058246811e-07, + "loss": 0.03377273797988892, + "mean_token_accuracy": 0.9911105188727379, + "num_tokens": 34928321.0, + "step": 33075 + }, + { + "entropy": 0.020773140575256546, + "epoch": 2.918891509953923, + "grad_norm": 1.46875, + "learning_rate": 3.841573909872387e-07, + "loss": 0.022910866737365723, + "mean_token_accuracy": 0.9938488656282425, + "num_tokens": 34954288.0, + "step": 33100 + }, + { + "entropy": 0.0312850209017779, + "epoch": 2.9210961440948875, + "grad_norm": 0.384765625, + "learning_rate": 3.6359747142251654e-07, + "loss": 0.0410335636138916, + "mean_token_accuracy": 0.9900199168920517, + "num_tokens": 34981329.0, + "step": 33125 + }, + { + "entropy": 0.025324092398041103, + "epoch": 2.923300778235852, + "grad_norm": 0.71875, + "learning_rate": 3.436019635965848e-07, + "loss": 0.02742032051086426, + "mean_token_accuracy": 0.9904520413279534, + "num_tokens": 35006783.0, + "step": 33150 + }, + { + "entropy": 0.031204914556074072, + "epoch": 2.925505412376816, + "grad_norm": 0.640625, + "learning_rate": 3.241709807782822e-07, + "loss": 0.04399027347564697, + "mean_token_accuracy": 0.9883441299200058, + "num_tokens": 35033941.0, + "step": 33175 + }, + { + "entropy": 0.03261162290466018, + "epoch": 2.92771004651778, + "grad_norm": 0.3828125, + "learning_rate": 3.053046330385723e-07, + "loss": 0.03782075881958008, + "mean_token_accuracy": 0.9898115587234497, + "num_tokens": 35060891.0, + "step": 33200 + }, + { + "entropy": 0.03443793041369645, + "epoch": 2.9299146806587446, + "grad_norm": 1.2734375, + "learning_rate": 2.8700302724992134e-07, + "loss": 0.04203218460083008, + "mean_token_accuracy": 0.9881391364336014, + "num_tokens": 35089278.0, + "step": 33225 + }, + { + "entropy": 0.027772624024983088, + "epoch": 2.932119314799709, + "grad_norm": 2.484375, + "learning_rate": 2.692662670856883e-07, + "loss": 0.026545183658599855, + "mean_token_accuracy": 0.9932020205259323, + "num_tokens": 35115886.0, + "step": 33250 + }, + { + "entropy": 0.030104288711700063, + "epoch": 2.934323948940673, + "grad_norm": 0.9453125, + "learning_rate": 2.520944530195579e-07, + "loss": 0.04437981128692627, + "mean_token_accuracy": 0.9900987917184829, + "num_tokens": 35143409.0, + "step": 33275 + }, + { + "entropy": 0.02149091817529552, + "epoch": 2.9365285830816377, + "grad_norm": 3.171875, + "learning_rate": 2.35487682324953e-07, + "loss": 0.027275030612945558, + "mean_token_accuracy": 0.9921983778476715, + "num_tokens": 35169840.0, + "step": 33300 + }, + { + "entropy": 0.02388738744515649, + "epoch": 2.9387332172226017, + "grad_norm": 0.05810546875, + "learning_rate": 2.1944604907446765e-07, + "loss": 0.03441377639770508, + "mean_token_accuracy": 0.9911992436647415, + "num_tokens": 35196080.0, + "step": 33325 + }, + { + "entropy": 0.03426591103016108, + "epoch": 2.9409378513635662, + "grad_norm": 1.1640625, + "learning_rate": 2.0396964413937903e-07, + "loss": 0.04174567222595215, + "mean_token_accuracy": 0.9897310450673104, + "num_tokens": 35223861.0, + "step": 33350 + }, + { + "entropy": 0.030975585428168414, + "epoch": 2.9431424855045307, + "grad_norm": 1.7265625, + "learning_rate": 1.890585551890811e-07, + "loss": 0.03444629192352295, + "mean_token_accuracy": 0.9910498291254044, + "num_tokens": 35250459.0, + "step": 33375 + }, + { + "entropy": 0.030872389084070164, + "epoch": 2.945347119645495, + "grad_norm": 1.21875, + "learning_rate": 1.747128666906517e-07, + "loss": 0.045081048011779784, + "mean_token_accuracy": 0.989509349167347, + "num_tokens": 35277415.0, + "step": 33400 + }, + { + "entropy": 0.02479782605125365, + "epoch": 2.9475517537864593, + "grad_norm": 0.73828125, + "learning_rate": 1.609326599083083e-07, + "loss": 0.025031945705413818, + "mean_token_accuracy": 0.9937001445889473, + "num_tokens": 35303505.0, + "step": 33425 + }, + { + "entropy": 0.02384750459346833, + "epoch": 2.9497563879274233, + "grad_norm": 1.2734375, + "learning_rate": 1.477180129029754e-07, + "loss": 0.02797969341278076, + "mean_token_accuracy": 0.9879170176386833, + "num_tokens": 35329409.0, + "step": 33450 + }, + { + "entropy": 0.02900122652774371, + "epoch": 2.951961022068388, + "grad_norm": 0.984375, + "learning_rate": 1.3506900053186223e-07, + "loss": 0.04034067630767822, + "mean_token_accuracy": 0.9863409793376923, + "num_tokens": 35356547.0, + "step": 33475 + }, + { + "entropy": 0.028102036683012557, + "epoch": 2.954165656209352, + "grad_norm": 1.03125, + "learning_rate": 1.229856944480079e-07, + "loss": 0.03228505849838257, + "mean_token_accuracy": 0.992490217089653, + "num_tokens": 35382846.0, + "step": 33500 + }, + { + "entropy": 0.030634857748700596, + "epoch": 2.9563702903503164, + "grad_norm": 2.34375, + "learning_rate": 1.1146816309987041e-07, + "loss": 0.03508009195327759, + "mean_token_accuracy": 0.9879120439291, + "num_tokens": 35408950.0, + "step": 33525 + }, + { + "entropy": 0.023159925354211738, + "epoch": 2.9585749244912805, + "grad_norm": 0.7734375, + "learning_rate": 1.0051647173099365e-07, + "loss": 0.024101905822753907, + "mean_token_accuracy": 0.9922597792744636, + "num_tokens": 35434210.0, + "step": 33550 + }, + { + "entropy": 0.03500574579287786, + "epoch": 2.960779558632245, + "grad_norm": 0.515625, + "learning_rate": 9.013068237956335e-08, + "loss": 0.05052834987640381, + "mean_token_accuracy": 0.9885596024990082, + "num_tokens": 35461997.0, + "step": 33575 + }, + { + "entropy": 0.026359798516132286, + "epoch": 2.9629841927732095, + "grad_norm": 0.421875, + "learning_rate": 8.031085387811832e-08, + "loss": 0.03985478401184082, + "mean_token_accuracy": 0.9904181951284409, + "num_tokens": 35487065.0, + "step": 33600 + }, + { + "entropy": 0.02394567044406358, + "epoch": 2.9651888269141735, + "grad_norm": 0.56640625, + "learning_rate": 7.105704185316197e-08, + "loss": 0.025043725967407227, + "mean_token_accuracy": 0.9922685399651527, + "num_tokens": 35511639.0, + "step": 33625 + }, + { + "entropy": 0.024473328073963786, + "epoch": 2.967393461055138, + "grad_norm": 0.9921875, + "learning_rate": 6.236929872491804e-08, + "loss": 0.02385477304458618, + "mean_token_accuracy": 0.9946402052044868, + "num_tokens": 35538356.0, + "step": 33650 + }, + { + "entropy": 0.028815567944038775, + "epoch": 2.969598095196102, + "grad_norm": 0.3828125, + "learning_rate": 5.424767370695305e-08, + "loss": 0.034439117908477784, + "mean_token_accuracy": 0.986783909201622, + "num_tokens": 35565031.0, + "step": 33675 + }, + { + "entropy": 0.03250583879053011, + "epoch": 2.9718027293370666, + "grad_norm": 0.59375, + "learning_rate": 4.6692212805965475e-08, + "loss": 0.037513306140899656, + "mean_token_accuracy": 0.9900922834873199, + "num_tokens": 35592243.0, + "step": 33700 + }, + { + "entropy": 0.027665380477410507, + "epoch": 2.974007363478031, + "grad_norm": 0.30859375, + "learning_rate": 3.9702958821463684e-08, + "loss": 0.034902803897857666, + "mean_token_accuracy": 0.9913255712389946, + "num_tokens": 35618924.0, + "step": 33725 + }, + { + "entropy": 0.026782970013227894, + "epoch": 2.976211997618995, + "grad_norm": 0.259765625, + "learning_rate": 3.3279951345577265e-08, + "loss": 0.044511117935180665, + "mean_token_accuracy": 0.9920956519246101, + "num_tokens": 35645678.0, + "step": 33750 + }, + { + "entropy": 0.032550619621688384, + "epoch": 2.978416631759959, + "grad_norm": 0.076171875, + "learning_rate": 2.7423226762812725e-08, + "loss": 0.042112269401550294, + "mean_token_accuracy": 0.9892310863733291, + "num_tokens": 35673477.0, + "step": 33775 + }, + { + "entropy": 0.02943160523022016, + "epoch": 2.9806212659009237, + "grad_norm": 2.515625, + "learning_rate": 2.213281824984259e-08, + "loss": 0.03308338642120361, + "mean_token_accuracy": 0.9916199234127998, + "num_tokens": 35699973.0, + "step": 33800 + }, + { + "entropy": 0.030929924099600613, + "epoch": 2.982825900041888, + "grad_norm": 0.453125, + "learning_rate": 1.740875577531664e-08, + "loss": 0.04199523448944092, + "mean_token_accuracy": 0.984671610891819, + "num_tokens": 35727474.0, + "step": 33825 + }, + { + "entropy": 0.03162372545186372, + "epoch": 2.9850305341828522, + "grad_norm": 0.66796875, + "learning_rate": 1.3251066099684295e-08, + "loss": 0.03665184259414673, + "mean_token_accuracy": 0.991539233326912, + "num_tokens": 35754131.0, + "step": 33850 + }, + { + "entropy": 0.02623558983214025, + "epoch": 2.9872351683238167, + "grad_norm": 1.8671875, + "learning_rate": 9.659772775094666e-09, + "loss": 0.030340723991394043, + "mean_token_accuracy": 0.9928580421209335, + "num_tokens": 35780165.0, + "step": 33875 + }, + { + "entropy": 0.023927184478125127, + "epoch": 2.989439802464781, + "grad_norm": 0.06396484375, + "learning_rate": 6.634896145185643e-09, + "loss": 0.03702516794204712, + "mean_token_accuracy": 0.9920265239477157, + "num_tokens": 35805641.0, + "step": 33900 + }, + { + "entropy": 0.022169054430632967, + "epoch": 2.9916444366057453, + "grad_norm": 0.96875, + "learning_rate": 4.176453345017262e-09, + "loss": 0.025482571125030516, + "mean_token_accuracy": 0.9918701857328415, + "num_tokens": 35830790.0, + "step": 33925 + }, + { + "entropy": 0.025008084610890366, + "epoch": 2.99384907074671, + "grad_norm": 0.80859375, + "learning_rate": 2.28445830096069e-09, + "loss": 0.0239717435836792, + "mean_token_accuracy": 0.9920974162220955, + "num_tokens": 35856028.0, + "step": 33950 + }, + { + "entropy": 0.02687357971597521, + "epoch": 2.996053704887674, + "grad_norm": 0.10546875, + "learning_rate": 9.589217306316123e-10, + "loss": 0.03782797336578369, + "mean_token_accuracy": 0.9907488691806793, + "num_tokens": 35882072.0, + "step": 33975 + }, + { + "entropy": 0.025469130103374482, + "epoch": 2.9982583390286384, + "grad_norm": 1.6484375, + "learning_rate": 1.9985114282361493e-10, + "loss": 0.027287905216217042, + "mean_token_accuracy": 0.9933070641756058, + "num_tokens": 35907980.0, + "step": 34000 + }, + { + "epoch": 2.9982583390286384, + "eval_entropy": 0.015145191631567904, + "eval_loss": 0.024168651551008224, + "eval_mean_token_accuracy": 0.9928109939925426, + "eval_num_tokens": 35907980.0, + "eval_runtime": 225.267, + "eval_samples_per_second": 17.442, + "eval_steps_per_second": 4.364, + "step": 34000 + }, + { + "epoch": 3.0, + "eval_entropy": 0.015145191631567904, + "eval_loss": 0.024168651551008224, + "eval_mean_token_accuracy": 0.9928109939925426, + "eval_num_tokens": 35928990.0, + "eval_runtime": 225.9543, + "eval_samples_per_second": 17.388, + "eval_steps_per_second": 4.35, + "step": 34020 + } + ], + "logging_steps": 25, + "max_steps": 34020, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.569718455428137e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}