{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8021390374331551, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3211480975151062, "epoch": 0.0005347593582887701, "grad_norm": 1.212019681930542, "learning_rate": 0.0, "loss": 1.8108, "mean_token_accuracy": 0.6068548411130905, "num_tokens": 32768.0, "step": 1 }, { "entropy": 1.319978803396225, "epoch": 0.0010695187165775401, "grad_norm": 1.2323760986328125, "learning_rate": 3.5087719298245615e-06, "loss": 1.8242, "mean_token_accuracy": 0.6032210290431976, "num_tokens": 65399.0, "step": 2 }, { "entropy": 1.3422840237617493, "epoch": 0.0016042780748663102, "grad_norm": 1.2383126020431519, "learning_rate": 7.017543859649123e-06, "loss": 1.8527, "mean_token_accuracy": 0.5934058427810669, "num_tokens": 97907.0, "step": 3 }, { "entropy": 1.342028558254242, "epoch": 0.0021390374331550803, "grad_norm": 1.2295197248458862, "learning_rate": 1.0526315789473684e-05, "loss": 1.8324, "mean_token_accuracy": 0.6023643612861633, "num_tokens": 130675.0, "step": 4 }, { "entropy": 1.36601322889328, "epoch": 0.00267379679144385, "grad_norm": 1.1857184171676636, "learning_rate": 1.4035087719298246e-05, "loss": 1.8561, "mean_token_accuracy": 0.5968963950872421, "num_tokens": 163443.0, "step": 5 }, { "entropy": 1.3593934178352356, "epoch": 0.0032085561497326204, "grad_norm": 1.140973448753357, "learning_rate": 1.7543859649122806e-05, "loss": 1.8346, "mean_token_accuracy": 0.5957307517528534, "num_tokens": 195506.0, "step": 6 }, { "entropy": 1.4132839739322662, "epoch": 0.0037433155080213902, "grad_norm": 1.1102650165557861, "learning_rate": 2.105263157894737e-05, "loss": 1.8617, "mean_token_accuracy": 0.5902331173419952, "num_tokens": 228061.0, "step": 7 }, { "entropy": 1.331324964761734, "epoch": 0.0042780748663101605, "grad_norm": 1.0002244710922241, "learning_rate": 2.456140350877193e-05, "loss": 1.741, "mean_token_accuracy": 0.6074766218662262, "num_tokens": 260804.0, "step": 8 }, { "entropy": 1.426033079624176, "epoch": 0.004812834224598931, "grad_norm": 0.8240267038345337, "learning_rate": 2.8070175438596492e-05, "loss": 1.7741, "mean_token_accuracy": 0.6042277812957764, "num_tokens": 293572.0, "step": 9 }, { "entropy": 1.4304762780666351, "epoch": 0.0053475935828877, "grad_norm": 0.6928900480270386, "learning_rate": 3.157894736842105e-05, "loss": 1.7112, "mean_token_accuracy": 0.6084738671779633, "num_tokens": 326340.0, "step": 10 }, { "entropy": 1.4698238670825958, "epoch": 0.0058823529411764705, "grad_norm": 0.6097795367240906, "learning_rate": 3.508771929824561e-05, "loss": 1.6587, "mean_token_accuracy": 0.6164537668228149, "num_tokens": 358594.0, "step": 11 }, { "entropy": 1.5248451828956604, "epoch": 0.006417112299465241, "grad_norm": 0.5526272058486938, "learning_rate": 3.859649122807018e-05, "loss": 1.6538, "mean_token_accuracy": 0.6143695116043091, "num_tokens": 391362.0, "step": 12 }, { "entropy": 1.5860177278518677, "epoch": 0.006951871657754011, "grad_norm": 0.5539830923080444, "learning_rate": 4.210526315789474e-05, "loss": 1.67, "mean_token_accuracy": 0.6128115952014923, "num_tokens": 424130.0, "step": 13 }, { "entropy": 1.5437233448028564, "epoch": 0.0074866310160427805, "grad_norm": 0.6331645846366882, "learning_rate": 4.56140350877193e-05, "loss": 1.5714, "mean_token_accuracy": 0.6330629885196686, "num_tokens": 456485.0, "step": 14 }, { "entropy": 1.6081304848194122, "epoch": 0.008021390374331552, "grad_norm": 0.6115171909332275, "learning_rate": 4.912280701754386e-05, "loss": 1.5736, "mean_token_accuracy": 0.6276161074638367, "num_tokens": 489052.0, "step": 15 }, { "entropy": 1.547630399465561, "epoch": 0.008556149732620321, "grad_norm": 0.5796217918395996, "learning_rate": 5.2631578947368424e-05, "loss": 1.475, "mean_token_accuracy": 0.6473007500171661, "num_tokens": 521691.0, "step": 16 }, { "entropy": 1.6032908260822296, "epoch": 0.00909090909090909, "grad_norm": 0.5352026224136353, "learning_rate": 5.6140350877192984e-05, "loss": 1.475, "mean_token_accuracy": 0.6525703221559525, "num_tokens": 554357.0, "step": 17 }, { "entropy": 1.5571948289871216, "epoch": 0.009625668449197862, "grad_norm": 0.49015113711357117, "learning_rate": 5.9649122807017544e-05, "loss": 1.4005, "mean_token_accuracy": 0.669808566570282, "num_tokens": 586990.0, "step": 18 }, { "entropy": 1.5882920324802399, "epoch": 0.010160427807486631, "grad_norm": 0.5017333030700684, "learning_rate": 6.31578947368421e-05, "loss": 1.408, "mean_token_accuracy": 0.6676928251981735, "num_tokens": 619449.0, "step": 19 }, { "entropy": 1.5626413524150848, "epoch": 0.0106951871657754, "grad_norm": 0.5140109062194824, "learning_rate": 6.666666666666667e-05, "loss": 1.3925, "mean_token_accuracy": 0.6779542714357376, "num_tokens": 652165.0, "step": 20 }, { "entropy": 1.4453319311141968, "epoch": 0.011229946524064172, "grad_norm": 0.5080186128616333, "learning_rate": 7.017543859649122e-05, "loss": 1.301, "mean_token_accuracy": 0.6922348439693451, "num_tokens": 684933.0, "step": 21 }, { "entropy": 1.4204711019992828, "epoch": 0.011764705882352941, "grad_norm": 0.5037912130355835, "learning_rate": 7.368421052631579e-05, "loss": 1.3096, "mean_token_accuracy": 0.690493643283844, "num_tokens": 717701.0, "step": 22 }, { "entropy": 1.3163996636867523, "epoch": 0.01229946524064171, "grad_norm": 0.47926607728004456, "learning_rate": 7.719298245614036e-05, "loss": 1.2587, "mean_token_accuracy": 0.7055350989103317, "num_tokens": 750036.0, "step": 23 }, { "entropy": 1.1758967638015747, "epoch": 0.012834224598930482, "grad_norm": 0.4284074306488037, "learning_rate": 8.070175438596491e-05, "loss": 1.1893, "mean_token_accuracy": 0.7212548851966858, "num_tokens": 782804.0, "step": 24 }, { "entropy": 1.1575067341327667, "epoch": 0.013368983957219251, "grad_norm": 0.4281991720199585, "learning_rate": 8.421052631578948e-05, "loss": 1.2208, "mean_token_accuracy": 0.7123541980981827, "num_tokens": 815474.0, "step": 25 }, { "entropy": 1.0813260674476624, "epoch": 0.013903743315508022, "grad_norm": 0.38478702306747437, "learning_rate": 8.771929824561403e-05, "loss": 1.1507, "mean_token_accuracy": 0.7251185774803162, "num_tokens": 848059.0, "step": 26 }, { "entropy": 1.0400638580322266, "epoch": 0.014438502673796792, "grad_norm": 0.36128902435302734, "learning_rate": 9.12280701754386e-05, "loss": 1.1269, "mean_token_accuracy": 0.7298998236656189, "num_tokens": 880827.0, "step": 27 }, { "entropy": 1.0808460414409637, "epoch": 0.014973262032085561, "grad_norm": 0.3295935094356537, "learning_rate": 9.473684210526316e-05, "loss": 1.1262, "mean_token_accuracy": 0.7256537228822708, "num_tokens": 913595.0, "step": 28 }, { "entropy": 1.1501913368701935, "epoch": 0.015508021390374332, "grad_norm": 0.29811128973960876, "learning_rate": 9.824561403508771e-05, "loss": 1.1599, "mean_token_accuracy": 0.7209208011627197, "num_tokens": 945634.0, "step": 29 }, { "entropy": 1.173025131225586, "epoch": 0.016042780748663103, "grad_norm": 0.2845797836780548, "learning_rate": 0.0001017543859649123, "loss": 1.135, "mean_token_accuracy": 0.721837192773819, "num_tokens": 978214.0, "step": 30 }, { "entropy": 1.1669634282588959, "epoch": 0.016577540106951873, "grad_norm": 0.2711884379386902, "learning_rate": 0.00010526315789473685, "loss": 1.1043, "mean_token_accuracy": 0.7299549430608749, "num_tokens": 1010857.0, "step": 31 }, { "entropy": 1.206926554441452, "epoch": 0.017112299465240642, "grad_norm": 0.25197407603263855, "learning_rate": 0.00010877192982456141, "loss": 1.1569, "mean_token_accuracy": 0.7189331501722336, "num_tokens": 1043571.0, "step": 32 }, { "entropy": 1.169577717781067, "epoch": 0.01764705882352941, "grad_norm": 0.21474692225456238, "learning_rate": 0.00011228070175438597, "loss": 1.1459, "mean_token_accuracy": 0.7232099175453186, "num_tokens": 1076339.0, "step": 33 }, { "entropy": 1.1237863898277283, "epoch": 0.01818181818181818, "grad_norm": 0.18793237209320068, "learning_rate": 0.00011578947368421053, "loss": 1.1102, "mean_token_accuracy": 0.724839448928833, "num_tokens": 1109022.0, "step": 34 }, { "entropy": 1.1527383029460907, "epoch": 0.01871657754010695, "grad_norm": 0.17025186121463776, "learning_rate": 0.00011929824561403509, "loss": 1.1621, "mean_token_accuracy": 0.7170107066631317, "num_tokens": 1141346.0, "step": 35 }, { "entropy": 1.1023530662059784, "epoch": 0.019251336898395723, "grad_norm": 0.16425541043281555, "learning_rate": 0.00012280701754385965, "loss": 1.1279, "mean_token_accuracy": 0.7268872112035751, "num_tokens": 1174021.0, "step": 36 }, { "entropy": 1.1330838203430176, "epoch": 0.019786096256684493, "grad_norm": 0.15079551935195923, "learning_rate": 0.0001263157894736842, "loss": 1.1532, "mean_token_accuracy": 0.7206839919090271, "num_tokens": 1206463.0, "step": 37 }, { "entropy": 1.099076896905899, "epoch": 0.020320855614973262, "grad_norm": 0.1482747495174408, "learning_rate": 0.0001298245614035088, "loss": 1.1045, "mean_token_accuracy": 0.7323130667209625, "num_tokens": 1239231.0, "step": 38 }, { "entropy": 1.1178455352783203, "epoch": 0.02085561497326203, "grad_norm": 0.14701837301254272, "learning_rate": 0.00013333333333333334, "loss": 1.126, "mean_token_accuracy": 0.724346861243248, "num_tokens": 1271872.0, "step": 39 }, { "entropy": 1.105290025472641, "epoch": 0.0213903743315508, "grad_norm": 0.13479559123516083, "learning_rate": 0.0001368421052631579, "loss": 1.0997, "mean_token_accuracy": 0.7327084541320801, "num_tokens": 1304443.0, "step": 40 }, { "entropy": 1.0889496803283691, "epoch": 0.021925133689839574, "grad_norm": 0.13660141825675964, "learning_rate": 0.00014035087719298245, "loss": 1.0839, "mean_token_accuracy": 0.7350012362003326, "num_tokens": 1337211.0, "step": 41 }, { "entropy": 1.0803407430648804, "epoch": 0.022459893048128343, "grad_norm": 0.13740068674087524, "learning_rate": 0.00014385964912280703, "loss": 1.0622, "mean_token_accuracy": 0.7330767512321472, "num_tokens": 1369979.0, "step": 42 }, { "entropy": 1.1109019815921783, "epoch": 0.022994652406417113, "grad_norm": 0.13445861637592316, "learning_rate": 0.00014736842105263158, "loss": 1.098, "mean_token_accuracy": 0.7316307574510574, "num_tokens": 1402446.0, "step": 43 }, { "entropy": 1.0604596138000488, "epoch": 0.023529411764705882, "grad_norm": 0.13360567390918732, "learning_rate": 0.00015087719298245616, "loss": 1.0433, "mean_token_accuracy": 0.740469217300415, "num_tokens": 1435214.0, "step": 44 }, { "entropy": 1.0373520702123642, "epoch": 0.02406417112299465, "grad_norm": 0.135879784822464, "learning_rate": 0.0001543859649122807, "loss": 1.0364, "mean_token_accuracy": 0.7431574016809464, "num_tokens": 1467242.0, "step": 45 }, { "entropy": 1.0861902832984924, "epoch": 0.02459893048128342, "grad_norm": 0.12583845853805542, "learning_rate": 0.00015789473684210527, "loss": 1.0982, "mean_token_accuracy": 0.7279447764158249, "num_tokens": 1500010.0, "step": 46 }, { "entropy": 1.1126323342323303, "epoch": 0.025133689839572194, "grad_norm": 0.13058513402938843, "learning_rate": 0.00016140350877192982, "loss": 1.1172, "mean_token_accuracy": 0.7277615070343018, "num_tokens": 1532778.0, "step": 47 }, { "entropy": 1.0311926305294037, "epoch": 0.025668449197860963, "grad_norm": 0.13519766926765442, "learning_rate": 0.0001649122807017544, "loss": 1.0401, "mean_token_accuracy": 0.74251589179039, "num_tokens": 1565546.0, "step": 48 }, { "entropy": 1.047630786895752, "epoch": 0.026203208556149733, "grad_norm": 0.12377266585826874, "learning_rate": 0.00016842105263157895, "loss": 1.0609, "mean_token_accuracy": 0.7348484992980957, "num_tokens": 1598314.0, "step": 49 }, { "entropy": 1.092830628156662, "epoch": 0.026737967914438502, "grad_norm": 0.14160823822021484, "learning_rate": 0.00017192982456140353, "loss": 1.1167, "mean_token_accuracy": 0.7267261147499084, "num_tokens": 1630989.0, "step": 50 }, { "entropy": 1.0829817354679108, "epoch": 0.02727272727272727, "grad_norm": 0.1272098571062088, "learning_rate": 0.00017543859649122806, "loss": 1.0993, "mean_token_accuracy": 0.7326164692640305, "num_tokens": 1663449.0, "step": 51 }, { "entropy": 1.1042539179325104, "epoch": 0.027807486631016044, "grad_norm": 0.13002848625183105, "learning_rate": 0.00017894736842105264, "loss": 1.1247, "mean_token_accuracy": 0.7220796793699265, "num_tokens": 1696217.0, "step": 52 }, { "entropy": 0.9782739132642746, "epoch": 0.028342245989304814, "grad_norm": 0.1287028193473816, "learning_rate": 0.0001824561403508772, "loss": 0.9925, "mean_token_accuracy": 0.7520161271095276, "num_tokens": 1728985.0, "step": 53 }, { "entropy": 1.0866808593273163, "epoch": 0.028877005347593583, "grad_norm": 0.12313037365674973, "learning_rate": 0.00018596491228070177, "loss": 1.0956, "mean_token_accuracy": 0.72519551217556, "num_tokens": 1761753.0, "step": 54 }, { "entropy": 1.0856190919876099, "epoch": 0.029411764705882353, "grad_norm": 0.129723459482193, "learning_rate": 0.00018947368421052632, "loss": 1.0936, "mean_token_accuracy": 0.7340177297592163, "num_tokens": 1794424.0, "step": 55 }, { "entropy": 1.059571549296379, "epoch": 0.029946524064171122, "grad_norm": 0.12538288533687592, "learning_rate": 0.00019298245614035088, "loss": 1.0644, "mean_token_accuracy": 0.7346231341362, "num_tokens": 1827133.0, "step": 56 }, { "entropy": 1.0712441504001617, "epoch": 0.03048128342245989, "grad_norm": 0.12101280689239502, "learning_rate": 0.00019649122807017543, "loss": 1.0796, "mean_token_accuracy": 0.7299673855304718, "num_tokens": 1859852.0, "step": 57 }, { "entropy": 1.0577708184719086, "epoch": 0.031016042780748664, "grad_norm": 0.12647317349910736, "learning_rate": 0.0002, "loss": 1.0632, "mean_token_accuracy": 0.7361465245485306, "num_tokens": 1892494.0, "step": 58 }, { "entropy": 1.0639779716730118, "epoch": 0.03155080213903743, "grad_norm": 0.22237052023410797, "learning_rate": 0.0001999998648809627, "loss": 1.0602, "mean_token_accuracy": 0.7368035316467285, "num_tokens": 1925262.0, "step": 59 }, { "entropy": 1.0609510242938995, "epoch": 0.03208556149732621, "grad_norm": 0.3154584765434265, "learning_rate": 0.00019999945952425653, "loss": 1.0658, "mean_token_accuracy": 0.7347353398799896, "num_tokens": 1957815.0, "step": 60 }, { "entropy": 1.1127621531486511, "epoch": 0.032620320855614976, "grad_norm": 0.15080197155475616, "learning_rate": 0.0001999987839310986, "loss": 1.1205, "mean_token_accuracy": 0.7253787964582443, "num_tokens": 1990583.0, "step": 61 }, { "entropy": 1.1040469706058502, "epoch": 0.033155080213903745, "grad_norm": 0.16909120976924896, "learning_rate": 0.00019999783810351746, "loss": 1.0834, "mean_token_accuracy": 0.7315352112054825, "num_tokens": 2023225.0, "step": 62 }, { "entropy": 1.0955560803413391, "epoch": 0.033689839572192515, "grad_norm": 0.16623055934906006, "learning_rate": 0.00019999662204435317, "loss": 1.0766, "mean_token_accuracy": 0.7339774519205093, "num_tokens": 2055864.0, "step": 63 }, { "entropy": 1.1271432042121887, "epoch": 0.034224598930481284, "grad_norm": 0.12925198674201965, "learning_rate": 0.00019999513575725706, "loss": 1.1286, "mean_token_accuracy": 0.724672481417656, "num_tokens": 2088299.0, "step": 64 }, { "entropy": 1.0197514593601227, "epoch": 0.034759358288770054, "grad_norm": 0.1556098461151123, "learning_rate": 0.00019999337924669195, "loss": 1.0207, "mean_token_accuracy": 0.7488697469234467, "num_tokens": 2121067.0, "step": 65 }, { "entropy": 1.029851257801056, "epoch": 0.03529411764705882, "grad_norm": 0.15798597037792206, "learning_rate": 0.00019999135251793203, "loss": 1.0195, "mean_token_accuracy": 0.7466137111186981, "num_tokens": 2153662.0, "step": 66 }, { "entropy": 1.049491971731186, "epoch": 0.03582887700534759, "grad_norm": 0.11463714390993118, "learning_rate": 0.0001999890555770628, "loss": 1.05, "mean_token_accuracy": 0.7364568263292313, "num_tokens": 2186282.0, "step": 67 }, { "entropy": 1.0518030226230621, "epoch": 0.03636363636363636, "grad_norm": 0.14380760490894318, "learning_rate": 0.0001999864884309812, "loss": 1.0533, "mean_token_accuracy": 0.7368646264076233, "num_tokens": 2219050.0, "step": 68 }, { "entropy": 1.043479710817337, "epoch": 0.03689839572192513, "grad_norm": 0.14671902358531952, "learning_rate": 0.00019998365108739544, "loss": 1.0646, "mean_token_accuracy": 0.740808829665184, "num_tokens": 2250998.0, "step": 69 }, { "entropy": 1.0794665217399597, "epoch": 0.0374331550802139, "grad_norm": 0.12887804210186005, "learning_rate": 0.00019998054355482508, "loss": 1.1062, "mean_token_accuracy": 0.729044497013092, "num_tokens": 2283766.0, "step": 70 }, { "entropy": 1.1021603047847748, "epoch": 0.03796791443850268, "grad_norm": 0.13224957883358002, "learning_rate": 0.00019997716584260088, "loss": 1.121, "mean_token_accuracy": 0.7261119335889816, "num_tokens": 2316534.0, "step": 71 }, { "entropy": 1.0996226370334625, "epoch": 0.038502673796791446, "grad_norm": 0.12894891202449799, "learning_rate": 0.000199973517960865, "loss": 1.0928, "mean_token_accuracy": 0.7264660596847534, "num_tokens": 2349140.0, "step": 72 }, { "entropy": 1.1015254855155945, "epoch": 0.039037433155080216, "grad_norm": 0.11442919820547104, "learning_rate": 0.00019996959992057067, "loss": 1.0973, "mean_token_accuracy": 0.7288917452096939, "num_tokens": 2381908.0, "step": 73 }, { "entropy": 1.0996100306510925, "epoch": 0.039572192513368985, "grad_norm": 0.13185158371925354, "learning_rate": 0.00019996541173348238, "loss": 1.0737, "mean_token_accuracy": 0.73190838098526, "num_tokens": 2414346.0, "step": 74 }, { "entropy": 1.0455727726221085, "epoch": 0.040106951871657755, "grad_norm": 0.11635924875736237, "learning_rate": 0.0001999609534121758, "loss": 1.0108, "mean_token_accuracy": 0.7491058707237244, "num_tokens": 2446922.0, "step": 75 }, { "entropy": 1.0418742299079895, "epoch": 0.040641711229946524, "grad_norm": 0.11113014817237854, "learning_rate": 0.00019995622497003768, "loss": 1.023, "mean_token_accuracy": 0.7431905567646027, "num_tokens": 2479505.0, "step": 76 }, { "entropy": 1.0230610966682434, "epoch": 0.041176470588235294, "grad_norm": 0.10827413946390152, "learning_rate": 0.00019995122642126582, "loss": 1.0158, "mean_token_accuracy": 0.7468015551567078, "num_tokens": 2512204.0, "step": 77 }, { "entropy": 1.037041187286377, "epoch": 0.04171122994652406, "grad_norm": 0.12525281310081482, "learning_rate": 0.00019994595778086917, "loss": 1.0359, "mean_token_accuracy": 0.7424547970294952, "num_tokens": 2544972.0, "step": 78 }, { "entropy": 1.0368339121341705, "epoch": 0.04224598930481283, "grad_norm": 0.11969482898712158, "learning_rate": 0.00019994041906466745, "loss": 1.0618, "mean_token_accuracy": 0.7377283573150635, "num_tokens": 2577602.0, "step": 79 }, { "entropy": 1.0786939859390259, "epoch": 0.0427807486631016, "grad_norm": 0.12571953237056732, "learning_rate": 0.00019993461028929157, "loss": 1.0884, "mean_token_accuracy": 0.7278703451156616, "num_tokens": 2609992.0, "step": 80 }, { "entropy": 1.0542809069156647, "epoch": 0.04331550802139037, "grad_norm": 0.10666581988334656, "learning_rate": 0.0001999285314721832, "loss": 1.0673, "mean_token_accuracy": 0.7366433292627335, "num_tokens": 2642590.0, "step": 81 }, { "entropy": 1.073119729757309, "epoch": 0.04385026737967915, "grad_norm": 0.12644757330417633, "learning_rate": 0.00019992218263159485, "loss": 1.0759, "mean_token_accuracy": 0.733117088675499, "num_tokens": 2674660.0, "step": 82 }, { "entropy": 1.0488686561584473, "epoch": 0.04438502673796792, "grad_norm": 0.1137339323759079, "learning_rate": 0.00019991556378658992, "loss": 1.0481, "mean_token_accuracy": 0.7383919954299927, "num_tokens": 2707428.0, "step": 83 }, { "entropy": 1.0880440175533295, "epoch": 0.044919786096256686, "grad_norm": 0.11253127455711365, "learning_rate": 0.0001999086749570423, "loss": 1.0823, "mean_token_accuracy": 0.7314790040254593, "num_tokens": 2740103.0, "step": 84 }, { "entropy": 1.0373365581035614, "epoch": 0.045454545454545456, "grad_norm": 0.10806500911712646, "learning_rate": 0.00019990151616363684, "loss": 1.0379, "mean_token_accuracy": 0.7419739663600922, "num_tokens": 2772727.0, "step": 85 }, { "entropy": 1.0969046205282211, "epoch": 0.045989304812834225, "grad_norm": 0.1056627705693245, "learning_rate": 0.00019989408742786876, "loss": 1.0811, "mean_token_accuracy": 0.7304458320140839, "num_tokens": 2805247.0, "step": 86 }, { "entropy": 1.06920325756073, "epoch": 0.046524064171122995, "grad_norm": 0.11184051632881165, "learning_rate": 0.00019988638877204402, "loss": 1.0608, "mean_token_accuracy": 0.7345244139432907, "num_tokens": 2837817.0, "step": 87 }, { "entropy": 1.0635611712932587, "epoch": 0.047058823529411764, "grad_norm": 0.10178792476654053, "learning_rate": 0.00019987842021927887, "loss": 1.0573, "mean_token_accuracy": 0.7352456152439117, "num_tokens": 2870585.0, "step": 88 }, { "entropy": 1.0648300647735596, "epoch": 0.04759358288770053, "grad_norm": 0.10483819246292114, "learning_rate": 0.00019987018179350006, "loss": 1.0668, "mean_token_accuracy": 0.7302337884902954, "num_tokens": 2903134.0, "step": 89 }, { "entropy": 1.0471554398536682, "epoch": 0.0481283422459893, "grad_norm": 0.10856444388628006, "learning_rate": 0.00019986167351944466, "loss": 1.0554, "mean_token_accuracy": 0.73912513256073, "num_tokens": 2935902.0, "step": 90 }, { "entropy": 1.0166204869747162, "epoch": 0.04866310160427807, "grad_norm": 0.11259638518095016, "learning_rate": 0.00019985289542266002, "loss": 1.0351, "mean_token_accuracy": 0.7440738081932068, "num_tokens": 2968670.0, "step": 91 }, { "entropy": 1.046201303601265, "epoch": 0.04919786096256684, "grad_norm": 0.18939243257045746, "learning_rate": 0.00019984384752950364, "loss": 1.0577, "mean_token_accuracy": 0.7357452511787415, "num_tokens": 3000903.0, "step": 92 }, { "entropy": 1.0445843040943146, "epoch": 0.04973262032085562, "grad_norm": 0.11610482633113861, "learning_rate": 0.00019983452986714316, "loss": 1.0697, "mean_token_accuracy": 0.7375977635383606, "num_tokens": 3033671.0, "step": 93 }, { "entropy": 1.1029761135578156, "epoch": 0.05026737967914439, "grad_norm": 0.11175025254487991, "learning_rate": 0.00019982494246355617, "loss": 1.1181, "mean_token_accuracy": 0.725279837846756, "num_tokens": 3066347.0, "step": 94 }, { "entropy": 1.0046235769987106, "epoch": 0.05080213903743316, "grad_norm": 0.10787836462259293, "learning_rate": 0.00019981508534753028, "loss": 0.986, "mean_token_accuracy": 0.751044511795044, "num_tokens": 3098766.0, "step": 95 }, { "entropy": 1.0180357545614243, "epoch": 0.051336898395721926, "grad_norm": 0.10719222575426102, "learning_rate": 0.00019980495854866287, "loss": 1.0098, "mean_token_accuracy": 0.7471800893545151, "num_tokens": 3131448.0, "step": 96 }, { "entropy": 1.086255043745041, "epoch": 0.051871657754010696, "grad_norm": 0.1420079469680786, "learning_rate": 0.0001997945620973612, "loss": 1.0721, "mean_token_accuracy": 0.7333822250366211, "num_tokens": 3164216.0, "step": 97 }, { "entropy": 1.0434383749961853, "epoch": 0.052406417112299465, "grad_norm": 0.11976777762174606, "learning_rate": 0.00019978389602484203, "loss": 1.0372, "mean_token_accuracy": 0.7404081225395203, "num_tokens": 3196984.0, "step": 98 }, { "entropy": 1.0948996245861053, "epoch": 0.052941176470588235, "grad_norm": 0.11928348988294601, "learning_rate": 0.00019977296036313182, "loss": 1.119, "mean_token_accuracy": 0.7248346358537674, "num_tokens": 3229387.0, "step": 99 }, { "entropy": 1.0682441890239716, "epoch": 0.053475935828877004, "grad_norm": 0.11096571385860443, "learning_rate": 0.00019976175514506655, "loss": 1.0658, "mean_token_accuracy": 0.7372749745845795, "num_tokens": 3262145.0, "step": 100 }, { "entropy": 1.0616617947816849, "epoch": 0.05401069518716577, "grad_norm": 0.1181572750210762, "learning_rate": 0.00019975028040429145, "loss": 1.0604, "mean_token_accuracy": 0.7366300523281097, "num_tokens": 3294504.0, "step": 101 }, { "entropy": 1.0692074596881866, "epoch": 0.05454545454545454, "grad_norm": 0.11676756292581558, "learning_rate": 0.0001997385361752611, "loss": 1.069, "mean_token_accuracy": 0.7343024760484695, "num_tokens": 3326611.0, "step": 102 }, { "entropy": 1.0682894587516785, "epoch": 0.05508021390374331, "grad_norm": 0.11502093076705933, "learning_rate": 0.00019972652249323924, "loss": 1.0762, "mean_token_accuracy": 0.7318246513605118, "num_tokens": 3358661.0, "step": 103 }, { "entropy": 1.0630333721637726, "epoch": 0.05561497326203209, "grad_norm": 0.11878052353858948, "learning_rate": 0.00019971423939429877, "loss": 1.0523, "mean_token_accuracy": 0.7361778318881989, "num_tokens": 3391416.0, "step": 104 }, { "entropy": 1.0106346756219864, "epoch": 0.05614973262032086, "grad_norm": 0.10611041635274887, "learning_rate": 0.0001997016869153214, "loss": 1.0027, "mean_token_accuracy": 0.747006356716156, "num_tokens": 3424184.0, "step": 105 }, { "entropy": 1.0402246713638306, "epoch": 0.05668449197860963, "grad_norm": 0.11598736047744751, "learning_rate": 0.00019968886509399785, "loss": 1.051, "mean_token_accuracy": 0.7395623475313187, "num_tokens": 3456920.0, "step": 106 }, { "entropy": 1.0482510924339294, "epoch": 0.0572192513368984, "grad_norm": 0.11055859923362732, "learning_rate": 0.00019967577396882738, "loss": 1.0407, "mean_token_accuracy": 0.7408681511878967, "num_tokens": 3489392.0, "step": 107 }, { "entropy": 1.081942230463028, "epoch": 0.057754010695187166, "grad_norm": 0.11166244745254517, "learning_rate": 0.0001996624135791181, "loss": 1.0967, "mean_token_accuracy": 0.7317631989717484, "num_tokens": 3522160.0, "step": 108 }, { "entropy": 1.0274831652641296, "epoch": 0.058288770053475936, "grad_norm": 0.12437647581100464, "learning_rate": 0.0001996487839649865, "loss": 1.038, "mean_token_accuracy": 0.7414470911026001, "num_tokens": 3554175.0, "step": 109 }, { "entropy": 1.0478383004665375, "epoch": 0.058823529411764705, "grad_norm": 0.10964363068342209, "learning_rate": 0.00019963488516735745, "loss": 1.0471, "mean_token_accuracy": 0.7395887672901154, "num_tokens": 3586689.0, "step": 110 }, { "entropy": 1.0617424249649048, "epoch": 0.059358288770053474, "grad_norm": 0.12272202223539352, "learning_rate": 0.00019962071722796416, "loss": 1.0658, "mean_token_accuracy": 0.7380151003599167, "num_tokens": 3618950.0, "step": 111 }, { "entropy": 1.0538257956504822, "epoch": 0.059893048128342244, "grad_norm": 0.1245708093047142, "learning_rate": 0.00019960628018934786, "loss": 1.0585, "mean_token_accuracy": 0.7368472963571548, "num_tokens": 3651324.0, "step": 112 }, { "entropy": 1.0768160820007324, "epoch": 0.06042780748663101, "grad_norm": 0.1088668629527092, "learning_rate": 0.00019959157409485788, "loss": 1.0647, "mean_token_accuracy": 0.7375608086585999, "num_tokens": 3683822.0, "step": 113 }, { "entropy": 1.0708918273448944, "epoch": 0.06096256684491978, "grad_norm": 0.11890657991170883, "learning_rate": 0.00019957659898865143, "loss": 1.0666, "mean_token_accuracy": 0.7333291918039322, "num_tokens": 3716478.0, "step": 114 }, { "entropy": 1.071320116519928, "epoch": 0.06149732620320856, "grad_norm": 0.12597399950027466, "learning_rate": 0.00019956135491569344, "loss": 1.0616, "mean_token_accuracy": 0.7387379854917526, "num_tokens": 3748896.0, "step": 115 }, { "entropy": 1.071579933166504, "epoch": 0.06203208556149733, "grad_norm": 0.10608608275651932, "learning_rate": 0.0001995458419217564, "loss": 1.062, "mean_token_accuracy": 0.7356106638908386, "num_tokens": 3781466.0, "step": 116 }, { "entropy": 1.0784811079502106, "epoch": 0.06256684491978609, "grad_norm": 0.11198359727859497, "learning_rate": 0.00019953006005342036, "loss": 1.0794, "mean_token_accuracy": 0.7334589809179306, "num_tokens": 3813869.0, "step": 117 }, { "entropy": 1.073084980249405, "epoch": 0.06310160427807486, "grad_norm": 0.11027467995882034, "learning_rate": 0.0001995140093580727, "loss": 1.0646, "mean_token_accuracy": 0.7342070043087006, "num_tokens": 3846637.0, "step": 118 }, { "entropy": 1.0337157994508743, "epoch": 0.06363636363636363, "grad_norm": 0.1102069839835167, "learning_rate": 0.00019949768988390795, "loss": 1.0444, "mean_token_accuracy": 0.7443217188119888, "num_tokens": 3879176.0, "step": 119 }, { "entropy": 1.0600185543298721, "epoch": 0.06417112299465241, "grad_norm": 0.11147669702768326, "learning_rate": 0.00019948110167992768, "loss": 1.0755, "mean_token_accuracy": 0.7351716160774231, "num_tokens": 3911359.0, "step": 120 }, { "entropy": 1.0136645436286926, "epoch": 0.06470588235294118, "grad_norm": 0.11211156100034714, "learning_rate": 0.0001994642447959404, "loss": 1.0102, "mean_token_accuracy": 0.7462002784013748, "num_tokens": 3943619.0, "step": 121 }, { "entropy": 1.0536611080169678, "epoch": 0.06524064171122995, "grad_norm": 0.11316773295402527, "learning_rate": 0.00019944711928256134, "loss": 1.0641, "mean_token_accuracy": 0.7334738671779633, "num_tokens": 3976387.0, "step": 122 }, { "entropy": 1.0741864740848541, "epoch": 0.06577540106951872, "grad_norm": 0.11843208223581314, "learning_rate": 0.00019942972519121236, "loss": 1.0816, "mean_token_accuracy": 0.7349095940589905, "num_tokens": 4009155.0, "step": 123 }, { "entropy": 1.052805870771408, "epoch": 0.06631016042780749, "grad_norm": 0.10992563515901566, "learning_rate": 0.00019941206257412174, "loss": 1.0584, "mean_token_accuracy": 0.7372617423534393, "num_tokens": 4041923.0, "step": 124 }, { "entropy": 1.030860036611557, "epoch": 0.06684491978609626, "grad_norm": 0.10842890292406082, "learning_rate": 0.00019939413148432406, "loss": 1.0267, "mean_token_accuracy": 0.744562566280365, "num_tokens": 4074691.0, "step": 125 }, { "entropy": 1.0848415195941925, "epoch": 0.06737967914438503, "grad_norm": 0.11439650505781174, "learning_rate": 0.00019937593197566, "loss": 1.0704, "mean_token_accuracy": 0.7342680990695953, "num_tokens": 4107459.0, "step": 126 }, { "entropy": 1.0568927228450775, "epoch": 0.0679144385026738, "grad_norm": 0.11309509724378586, "learning_rate": 0.00019935746410277625, "loss": 1.042, "mean_token_accuracy": 0.738794133067131, "num_tokens": 4139840.0, "step": 127 }, { "entropy": 1.0811631679534912, "epoch": 0.06844919786096257, "grad_norm": 0.10690435022115707, "learning_rate": 0.00019933872792112525, "loss": 1.0668, "mean_token_accuracy": 0.7340264767408371, "num_tokens": 4172477.0, "step": 128 }, { "entropy": 1.0234280824661255, "epoch": 0.06898395721925134, "grad_norm": 0.10712087899446487, "learning_rate": 0.00019931972348696516, "loss": 1.0132, "mean_token_accuracy": 0.7456015348434448, "num_tokens": 4205042.0, "step": 129 }, { "entropy": 1.0305497348308563, "epoch": 0.06951871657754011, "grad_norm": 0.11901942640542984, "learning_rate": 0.00019930045085735952, "loss": 1.0288, "mean_token_accuracy": 0.741858035326004, "num_tokens": 4237786.0, "step": 130 }, { "entropy": 1.0717166364192963, "epoch": 0.07005347593582888, "grad_norm": 0.12446736544370651, "learning_rate": 0.00019928091009017719, "loss": 1.0685, "mean_token_accuracy": 0.7327694445848465, "num_tokens": 4270369.0, "step": 131 }, { "entropy": 1.0448037981987, "epoch": 0.07058823529411765, "grad_norm": 0.11024358868598938, "learning_rate": 0.00019926110124409216, "loss": 1.0584, "mean_token_accuracy": 0.7348179519176483, "num_tokens": 4303137.0, "step": 132 }, { "entropy": 1.0688583254814148, "epoch": 0.07112299465240642, "grad_norm": 0.11428970843553543, "learning_rate": 0.00019924102437858342, "loss": 1.0915, "mean_token_accuracy": 0.7348983436822891, "num_tokens": 4335600.0, "step": 133 }, { "entropy": 1.0266284346580505, "epoch": 0.07165775401069518, "grad_norm": 0.11974222213029861, "learning_rate": 0.00019922067955393462, "loss": 1.0375, "mean_token_accuracy": 0.7398888170719147, "num_tokens": 4368368.0, "step": 134 }, { "entropy": 1.040811687707901, "epoch": 0.07219251336898395, "grad_norm": 0.11608098447322845, "learning_rate": 0.00019920006683123406, "loss": 1.0513, "mean_token_accuracy": 0.7392490655183792, "num_tokens": 4400868.0, "step": 135 }, { "entropy": 1.0473428666591644, "epoch": 0.07272727272727272, "grad_norm": 0.10295426100492477, "learning_rate": 0.00019917918627237446, "loss": 1.0444, "mean_token_accuracy": 0.7357507795095444, "num_tokens": 4433511.0, "step": 136 }, { "entropy": 1.0344346463680267, "epoch": 0.0732620320855615, "grad_norm": 0.11141891032457352, "learning_rate": 0.00019915803794005274, "loss": 1.015, "mean_token_accuracy": 0.7442876398563385, "num_tokens": 4466279.0, "step": 137 }, { "entropy": 1.0611892938613892, "epoch": 0.07379679144385026, "grad_norm": 0.11746582388877869, "learning_rate": 0.00019913662189776978, "loss": 1.0577, "mean_token_accuracy": 0.7358400076627731, "num_tokens": 4499045.0, "step": 138 }, { "entropy": 1.0629407465457916, "epoch": 0.07433155080213903, "grad_norm": 0.10986413061618805, "learning_rate": 0.00019911493820983043, "loss": 1.0569, "mean_token_accuracy": 0.7403775751590729, "num_tokens": 4531813.0, "step": 139 }, { "entropy": 1.0572038888931274, "epoch": 0.0748663101604278, "grad_norm": 0.10804872959852219, "learning_rate": 0.00019909298694134307, "loss": 1.0489, "mean_token_accuracy": 0.7369809001684189, "num_tokens": 4564407.0, "step": 140 }, { "entropy": 1.04163858294487, "epoch": 0.07540106951871657, "grad_norm": 0.11756208539009094, "learning_rate": 0.0001990707681582196, "loss": 1.0528, "mean_token_accuracy": 0.739686906337738, "num_tokens": 4596642.0, "step": 141 }, { "entropy": 1.102006733417511, "epoch": 0.07593582887700535, "grad_norm": 0.14406466484069824, "learning_rate": 0.00019904828192717506, "loss": 1.1079, "mean_token_accuracy": 0.7283953577280045, "num_tokens": 4629322.0, "step": 142 }, { "entropy": 1.056261733174324, "epoch": 0.07647058823529412, "grad_norm": 0.10974816232919693, "learning_rate": 0.00019902552831572773, "loss": 1.0593, "mean_token_accuracy": 0.740469217300415, "num_tokens": 4662090.0, "step": 143 }, { "entropy": 1.0207159221172333, "epoch": 0.07700534759358289, "grad_norm": 0.11418622732162476, "learning_rate": 0.0001990025073921986, "loss": 1.0214, "mean_token_accuracy": 0.7460290491580963, "num_tokens": 4694445.0, "step": 144 }, { "entropy": 1.0311139822006226, "epoch": 0.07754010695187166, "grad_norm": 0.11726009100675583, "learning_rate": 0.00019897921922571135, "loss": 1.0314, "mean_token_accuracy": 0.7410428076982498, "num_tokens": 4727111.0, "step": 145 }, { "entropy": 1.0391079187393188, "epoch": 0.07807486631016043, "grad_norm": 0.10857763886451721, "learning_rate": 0.00019895566388619207, "loss": 1.0379, "mean_token_accuracy": 0.7357028722763062, "num_tokens": 4759610.0, "step": 146 }, { "entropy": 1.0950137078762054, "epoch": 0.0786096256684492, "grad_norm": 0.10514643788337708, "learning_rate": 0.00019893184144436903, "loss": 1.0951, "mean_token_accuracy": 0.7290331572294235, "num_tokens": 4792024.0, "step": 147 }, { "entropy": 1.0630358755588531, "epoch": 0.07914438502673797, "grad_norm": 0.11535201221704483, "learning_rate": 0.00019890775197177265, "loss": 1.0638, "mean_token_accuracy": 0.7384617477655411, "num_tokens": 4824569.0, "step": 148 }, { "entropy": 1.1177841126918793, "epoch": 0.07967914438502674, "grad_norm": 0.12657026946544647, "learning_rate": 0.00019888339554073505, "loss": 1.1044, "mean_token_accuracy": 0.7259286493062973, "num_tokens": 4857337.0, "step": 149 }, { "entropy": 1.0614158511161804, "epoch": 0.08021390374331551, "grad_norm": 0.11264298111200333, "learning_rate": 0.00019885877222438996, "loss": 1.0535, "mean_token_accuracy": 0.7389418482780457, "num_tokens": 4890105.0, "step": 150 }, { "entropy": 1.0805165469646454, "epoch": 0.08074866310160428, "grad_norm": 0.11258802562952042, "learning_rate": 0.00019883388209667246, "loss": 1.0775, "mean_token_accuracy": 0.7345370054244995, "num_tokens": 4922865.0, "step": 151 }, { "entropy": 1.0690855234861374, "epoch": 0.08128342245989305, "grad_norm": 0.13155648112297058, "learning_rate": 0.00019880872523231875, "loss": 1.0856, "mean_token_accuracy": 0.7277711480855942, "num_tokens": 4955260.0, "step": 152 }, { "entropy": 1.0855732262134552, "epoch": 0.08181818181818182, "grad_norm": 0.1189059242606163, "learning_rate": 0.00019878330170686602, "loss": 1.0923, "mean_token_accuracy": 0.7301136553287506, "num_tokens": 4988028.0, "step": 153 }, { "entropy": 1.0443801879882812, "epoch": 0.08235294117647059, "grad_norm": 0.11662495881319046, "learning_rate": 0.00019875761159665202, "loss": 1.0544, "mean_token_accuracy": 0.740158811211586, "num_tokens": 5020587.0, "step": 154 }, { "entropy": 1.0272859036922455, "epoch": 0.08288770053475936, "grad_norm": 0.12569770216941833, "learning_rate": 0.0001987316549788151, "loss": 1.0285, "mean_token_accuracy": 0.7470496594905853, "num_tokens": 5053027.0, "step": 155 }, { "entropy": 1.0688476860523224, "epoch": 0.08342245989304813, "grad_norm": 0.11243730783462524, "learning_rate": 0.00019870543193129377, "loss": 1.0608, "mean_token_accuracy": 0.738514170050621, "num_tokens": 5085795.0, "step": 156 }, { "entropy": 1.042402446269989, "epoch": 0.0839572192513369, "grad_norm": 0.11054984480142593, "learning_rate": 0.00019867894253282654, "loss": 1.0362, "mean_token_accuracy": 0.7432184815406799, "num_tokens": 5118563.0, "step": 157 }, { "entropy": 1.053287386894226, "epoch": 0.08449197860962566, "grad_norm": 0.1238577663898468, "learning_rate": 0.00019865218686295163, "loss": 1.046, "mean_token_accuracy": 0.7390121221542358, "num_tokens": 5150708.0, "step": 158 }, { "entropy": 1.0408558249473572, "epoch": 0.08502673796791443, "grad_norm": 0.11221247911453247, "learning_rate": 0.0001986251650020069, "loss": 1.0432, "mean_token_accuracy": 0.7450513243675232, "num_tokens": 5183476.0, "step": 159 }, { "entropy": 1.0343620330095291, "epoch": 0.0855614973262032, "grad_norm": 0.10995996743440628, "learning_rate": 0.00019859787703112937, "loss": 1.0217, "mean_token_accuracy": 0.7431268393993378, "num_tokens": 5216244.0, "step": 160 }, { "entropy": 1.0292404145002365, "epoch": 0.08609625668449197, "grad_norm": 0.11850112676620483, "learning_rate": 0.00019857032303225512, "loss": 1.037, "mean_token_accuracy": 0.7418358474969864, "num_tokens": 5249011.0, "step": 161 }, { "entropy": 1.0845747888088226, "epoch": 0.08663101604278074, "grad_norm": 0.11430878937244415, "learning_rate": 0.0001985425030881191, "loss": 1.0887, "mean_token_accuracy": 0.7280058711767197, "num_tokens": 5281779.0, "step": 162 }, { "entropy": 1.0065880417823792, "epoch": 0.08716577540106951, "grad_norm": 0.1066722646355629, "learning_rate": 0.00019851441728225465, "loss": 0.9979, "mean_token_accuracy": 0.7491233050823212, "num_tokens": 5314105.0, "step": 163 }, { "entropy": 1.0722036361694336, "epoch": 0.0877005347593583, "grad_norm": 0.11814260482788086, "learning_rate": 0.00019848606569899358, "loss": 1.0843, "mean_token_accuracy": 0.7380961626768112, "num_tokens": 5346493.0, "step": 164 }, { "entropy": 1.0834770202636719, "epoch": 0.08823529411764706, "grad_norm": 0.11347079277038574, "learning_rate": 0.00019845744842346554, "loss": 1.0796, "mean_token_accuracy": 0.7332694083452225, "num_tokens": 5379196.0, "step": 165 }, { "entropy": 1.0517558157444, "epoch": 0.08877005347593583, "grad_norm": 0.11098195612430573, "learning_rate": 0.0001984285655415981, "loss": 1.0576, "mean_token_accuracy": 0.7342869937419891, "num_tokens": 5411955.0, "step": 166 }, { "entropy": 1.0513665974140167, "epoch": 0.0893048128342246, "grad_norm": 0.11459293216466904, "learning_rate": 0.00019839941714011627, "loss": 1.0619, "mean_token_accuracy": 0.7377733439207077, "num_tokens": 5444628.0, "step": 167 }, { "entropy": 1.0738406777381897, "epoch": 0.08983957219251337, "grad_norm": 0.10587628185749054, "learning_rate": 0.00019837000330654244, "loss": 1.0757, "mean_token_accuracy": 0.7364008128643036, "num_tokens": 5477105.0, "step": 168 }, { "entropy": 1.0741328001022339, "epoch": 0.09037433155080214, "grad_norm": 0.10436589270830154, "learning_rate": 0.0001983403241291959, "loss": 1.0673, "mean_token_accuracy": 0.7362231314182281, "num_tokens": 5509873.0, "step": 169 }, { "entropy": 1.0744796991348267, "epoch": 0.09090909090909091, "grad_norm": 0.11066372692584991, "learning_rate": 0.00019831037969719262, "loss": 1.0661, "mean_token_accuracy": 0.7409461587667465, "num_tokens": 5542172.0, "step": 170 }, { "entropy": 1.0207346379756927, "epoch": 0.09144385026737968, "grad_norm": 0.10659626126289368, "learning_rate": 0.00019828017010044518, "loss": 1.014, "mean_token_accuracy": 0.7454484403133392, "num_tokens": 5574940.0, "step": 171 }, { "entropy": 1.0747233629226685, "epoch": 0.09197860962566845, "grad_norm": 0.10680273920297623, "learning_rate": 0.00019824969542966227, "loss": 1.086, "mean_token_accuracy": 0.735552579164505, "num_tokens": 5607622.0, "step": 172 }, { "entropy": 1.082386240363121, "epoch": 0.09251336898395722, "grad_norm": 0.10763442516326904, "learning_rate": 0.00019821895577634848, "loss": 1.0902, "mean_token_accuracy": 0.7295721918344498, "num_tokens": 5640310.0, "step": 173 }, { "entropy": 1.0541490614414215, "epoch": 0.09304812834224599, "grad_norm": 0.11163617670536041, "learning_rate": 0.0001981879512328041, "loss": 1.0566, "mean_token_accuracy": 0.7395776063203812, "num_tokens": 5672750.0, "step": 174 }, { "entropy": 1.0431351363658905, "epoch": 0.09358288770053476, "grad_norm": 0.1073818951845169, "learning_rate": 0.00019815668189212477, "loss": 1.0562, "mean_token_accuracy": 0.7366202473640442, "num_tokens": 5705518.0, "step": 175 }, { "entropy": 1.0744110643863678, "epoch": 0.09411764705882353, "grad_norm": 0.1086694523692131, "learning_rate": 0.0001981251478482013, "loss": 1.0858, "mean_token_accuracy": 0.729690670967102, "num_tokens": 5738149.0, "step": 176 }, { "entropy": 1.066655308008194, "epoch": 0.0946524064171123, "grad_norm": 0.10837189108133316, "learning_rate": 0.00019809334919571912, "loss": 1.0546, "mean_token_accuracy": 0.7364211529493332, "num_tokens": 5770470.0, "step": 177 }, { "entropy": 1.0561938136816025, "epoch": 0.09518716577540107, "grad_norm": 0.12487713992595673, "learning_rate": 0.00019806128603015838, "loss": 1.0408, "mean_token_accuracy": 0.7411956340074539, "num_tokens": 5802994.0, "step": 178 }, { "entropy": 1.051276072859764, "epoch": 0.09572192513368984, "grad_norm": 0.12799374759197235, "learning_rate": 0.0001980289584477934, "loss": 1.047, "mean_token_accuracy": 0.740519255399704, "num_tokens": 5835383.0, "step": 179 }, { "entropy": 1.124489665031433, "epoch": 0.0962566844919786, "grad_norm": 0.1299794316291809, "learning_rate": 0.0001979963665456924, "loss": 1.1152, "mean_token_accuracy": 0.731091171503067, "num_tokens": 5868151.0, "step": 180 }, { "entropy": 1.019109919667244, "epoch": 0.09679144385026738, "grad_norm": 0.1263756901025772, "learning_rate": 0.00019796351042171735, "loss": 1.0133, "mean_token_accuracy": 0.7450751811265945, "num_tokens": 5900659.0, "step": 181 }, { "entropy": 1.0870310068130493, "epoch": 0.09732620320855614, "grad_norm": 0.11404166370630264, "learning_rate": 0.00019793039017452344, "loss": 1.0762, "mean_token_accuracy": 0.7330767512321472, "num_tokens": 5933427.0, "step": 182 }, { "entropy": 1.0590984374284744, "epoch": 0.09786096256684491, "grad_norm": 0.1062210351228714, "learning_rate": 0.00019789700590355916, "loss": 1.0502, "mean_token_accuracy": 0.7384225428104401, "num_tokens": 5966195.0, "step": 183 }, { "entropy": 1.062455177307129, "epoch": 0.09839572192513368, "grad_norm": 0.11313077062368393, "learning_rate": 0.00019786335770906552, "loss": 1.066, "mean_token_accuracy": 0.7376256883144379, "num_tokens": 5998575.0, "step": 184 }, { "entropy": 1.1060601472854614, "epoch": 0.09893048128342247, "grad_norm": 0.11265819519758224, "learning_rate": 0.00019782944569207616, "loss": 1.1055, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 6031343.0, "step": 185 }, { "entropy": 1.030540555715561, "epoch": 0.09946524064171124, "grad_norm": 0.12320833653211594, "learning_rate": 0.00019779526995441685, "loss": 1.0115, "mean_token_accuracy": 0.7461815774440765, "num_tokens": 6064111.0, "step": 186 }, { "entropy": 1.0417098999023438, "epoch": 0.1, "grad_norm": 0.12461020052433014, "learning_rate": 0.00019776083059870513, "loss": 1.0605, "mean_token_accuracy": 0.7379734069108963, "num_tokens": 6096712.0, "step": 187 }, { "entropy": 1.046680524945259, "epoch": 0.10053475935828877, "grad_norm": 0.12361578643321991, "learning_rate": 0.0001977261277283503, "loss": 1.0588, "mean_token_accuracy": 0.7382086962461472, "num_tokens": 6129480.0, "step": 188 }, { "entropy": 1.0391662418842316, "epoch": 0.10106951871657754, "grad_norm": 0.12591002881526947, "learning_rate": 0.00019769116144755268, "loss": 1.0587, "mean_token_accuracy": 0.7403542697429657, "num_tokens": 6162164.0, "step": 189 }, { "entropy": 1.092732846736908, "epoch": 0.10160427807486631, "grad_norm": 0.11028733849525452, "learning_rate": 0.0001976559318613036, "loss": 1.1019, "mean_token_accuracy": 0.7272328734397888, "num_tokens": 6194327.0, "step": 190 }, { "entropy": 1.090900957584381, "epoch": 0.10213903743315508, "grad_norm": 0.10880324244499207, "learning_rate": 0.00019762043907538504, "loss": 1.079, "mean_token_accuracy": 0.7331654280424118, "num_tokens": 6227073.0, "step": 191 }, { "entropy": 1.1134484112262726, "epoch": 0.10267379679144385, "grad_norm": 0.1132679283618927, "learning_rate": 0.00019758468319636924, "loss": 1.1013, "mean_token_accuracy": 0.7296248972415924, "num_tokens": 6259841.0, "step": 192 }, { "entropy": 1.0712683200836182, "epoch": 0.10320855614973262, "grad_norm": 0.1124231144785881, "learning_rate": 0.00019754866433161842, "loss": 1.0755, "mean_token_accuracy": 0.734634667634964, "num_tokens": 6292609.0, "step": 193 }, { "entropy": 1.0586844384670258, "epoch": 0.10374331550802139, "grad_norm": 0.11450853943824768, "learning_rate": 0.00019751238258928447, "loss": 1.0439, "mean_token_accuracy": 0.7394875437021255, "num_tokens": 6325114.0, "step": 194 }, { "entropy": 1.0706075578927994, "epoch": 0.10427807486631016, "grad_norm": 0.12188895791769028, "learning_rate": 0.00019747583807830856, "loss": 1.0577, "mean_token_accuracy": 0.7379983961582184, "num_tokens": 6357514.0, "step": 195 }, { "entropy": 1.0366948395967484, "epoch": 0.10481283422459893, "grad_norm": 0.11499596387147903, "learning_rate": 0.00019743903090842094, "loss": 1.0249, "mean_token_accuracy": 0.7436766922473907, "num_tokens": 6390282.0, "step": 196 }, { "entropy": 1.0938681066036224, "epoch": 0.1053475935828877, "grad_norm": 0.10795161128044128, "learning_rate": 0.00019740196119014044, "loss": 1.0955, "mean_token_accuracy": 0.7291055768728256, "num_tokens": 6423050.0, "step": 197 }, { "entropy": 1.0152499079704285, "epoch": 0.10588235294117647, "grad_norm": 0.12174174934625626, "learning_rate": 0.00019736462903477428, "loss": 1.0136, "mean_token_accuracy": 0.7461740970611572, "num_tokens": 6455035.0, "step": 198 }, { "entropy": 1.0703981518745422, "epoch": 0.10641711229946524, "grad_norm": 0.10990523546934128, "learning_rate": 0.0001973270345544177, "loss": 1.0787, "mean_token_accuracy": 0.7326185256242752, "num_tokens": 6487803.0, "step": 199 }, { "entropy": 1.1004179418087006, "epoch": 0.10695187165775401, "grad_norm": 0.11005926132202148, "learning_rate": 0.0001972891778619536, "loss": 1.104, "mean_token_accuracy": 0.7294110655784607, "num_tokens": 6520571.0, "step": 200 }, { "entropy": 1.0091599971055984, "epoch": 0.10748663101604278, "grad_norm": 0.11055716127157211, "learning_rate": 0.00019725105907105215, "loss": 1.0109, "mean_token_accuracy": 0.7463954091072083, "num_tokens": 6553339.0, "step": 201 }, { "entropy": 1.0590536296367645, "epoch": 0.10802139037433155, "grad_norm": 0.11450552195310593, "learning_rate": 0.0001972126782961706, "loss": 1.0717, "mean_token_accuracy": 0.735278457403183, "num_tokens": 6585876.0, "step": 202 }, { "entropy": 1.049636334180832, "epoch": 0.10855614973262032, "grad_norm": 0.10723718255758286, "learning_rate": 0.00019717403565255276, "loss": 1.0435, "mean_token_accuracy": 0.7375023663043976, "num_tokens": 6618534.0, "step": 203 }, { "entropy": 1.0268239080905914, "epoch": 0.10909090909090909, "grad_norm": 0.110516756772995, "learning_rate": 0.00019713513125622882, "loss": 1.0285, "mean_token_accuracy": 0.7447504550218582, "num_tokens": 6651106.0, "step": 204 }, { "entropy": 1.0767436623573303, "epoch": 0.10962566844919786, "grad_norm": 0.1162244901061058, "learning_rate": 0.00019709596522401491, "loss": 1.0834, "mean_token_accuracy": 0.733895018696785, "num_tokens": 6683662.0, "step": 205 }, { "entropy": 1.0388650298118591, "epoch": 0.11016042780748662, "grad_norm": 0.12721914052963257, "learning_rate": 0.00019705653767351265, "loss": 1.0488, "mean_token_accuracy": 0.7420195192098618, "num_tokens": 6716238.0, "step": 206 }, { "entropy": 1.0942329466342926, "epoch": 0.11069518716577541, "grad_norm": 0.14424873888492584, "learning_rate": 0.00019701684872310904, "loss": 1.1077, "mean_token_accuracy": 0.7278364151716232, "num_tokens": 6748830.0, "step": 207 }, { "entropy": 1.0293540060520172, "epoch": 0.11122994652406418, "grad_norm": 0.14370134472846985, "learning_rate": 0.0001969768984919759, "loss": 1.037, "mean_token_accuracy": 0.7417389005422592, "num_tokens": 6781247.0, "step": 208 }, { "entropy": 1.0413859337568283, "epoch": 0.11176470588235295, "grad_norm": 0.13019073009490967, "learning_rate": 0.00019693668710006964, "loss": 1.0304, "mean_token_accuracy": 0.7385945320129395, "num_tokens": 6813848.0, "step": 209 }, { "entropy": 1.0882729589939117, "epoch": 0.11229946524064172, "grad_norm": 0.11451397836208344, "learning_rate": 0.00019689621466813077, "loss": 1.0789, "mean_token_accuracy": 0.7339456081390381, "num_tokens": 6846614.0, "step": 210 }, { "entropy": 1.0684372186660767, "epoch": 0.11283422459893049, "grad_norm": 0.11447669565677643, "learning_rate": 0.00019685548131768364, "loss": 1.0549, "mean_token_accuracy": 0.7408982962369919, "num_tokens": 6878989.0, "step": 211 }, { "entropy": 1.0850865244865417, "epoch": 0.11336898395721925, "grad_norm": 0.13719269633293152, "learning_rate": 0.00019681448717103609, "loss": 1.0645, "mean_token_accuracy": 0.7355644553899765, "num_tokens": 6911286.0, "step": 212 }, { "entropy": 1.0667450577020645, "epoch": 0.11390374331550802, "grad_norm": 0.14587928354740143, "learning_rate": 0.00019677323235127895, "loss": 1.0689, "mean_token_accuracy": 0.7330211699008942, "num_tokens": 6943962.0, "step": 213 }, { "entropy": 1.0296750366687775, "epoch": 0.1144385026737968, "grad_norm": 0.1201789379119873, "learning_rate": 0.00019673171698228584, "loss": 1.0215, "mean_token_accuracy": 0.7414021492004395, "num_tokens": 6976661.0, "step": 214 }, { "entropy": 1.042615681886673, "epoch": 0.11497326203208556, "grad_norm": 0.10887690633535385, "learning_rate": 0.0001966899411887127, "loss": 1.0306, "mean_token_accuracy": 0.7439039051532745, "num_tokens": 7009306.0, "step": 215 }, { "entropy": 1.0513212084770203, "epoch": 0.11550802139037433, "grad_norm": 0.127265065908432, "learning_rate": 0.00019664790509599738, "loss": 1.052, "mean_token_accuracy": 0.7402859330177307, "num_tokens": 7042074.0, "step": 216 }, { "entropy": 1.0337154269218445, "epoch": 0.1160427807486631, "grad_norm": 0.11888471245765686, "learning_rate": 0.0001966056088303594, "loss": 1.0388, "mean_token_accuracy": 0.7428157329559326, "num_tokens": 7074612.0, "step": 217 }, { "entropy": 1.02349191904068, "epoch": 0.11657754010695187, "grad_norm": 0.11285922676324844, "learning_rate": 0.0001965630525187994, "loss": 1.0401, "mean_token_accuracy": 0.7402248382568359, "num_tokens": 7107380.0, "step": 218 }, { "entropy": 1.0236869156360626, "epoch": 0.11711229946524064, "grad_norm": 0.12390273064374924, "learning_rate": 0.00019652023628909893, "loss": 1.0392, "mean_token_accuracy": 0.735905334353447, "num_tokens": 7139747.0, "step": 219 }, { "entropy": 1.031931921839714, "epoch": 0.11764705882352941, "grad_norm": 0.11192239075899124, "learning_rate": 0.00019647716026981987, "loss": 1.0439, "mean_token_accuracy": 0.7395527958869934, "num_tokens": 7172515.0, "step": 220 }, { "entropy": 1.0469720661640167, "epoch": 0.11818181818181818, "grad_norm": 0.11783554404973984, "learning_rate": 0.00019643382459030433, "loss": 1.0647, "mean_token_accuracy": 0.7378115952014923, "num_tokens": 7205283.0, "step": 221 }, { "entropy": 1.0731564462184906, "epoch": 0.11871657754010695, "grad_norm": 0.12124251574277878, "learning_rate": 0.0001963902293806739, "loss": 1.0743, "mean_token_accuracy": 0.734205812215805, "num_tokens": 7237546.0, "step": 222 }, { "entropy": 1.07041397690773, "epoch": 0.11925133689839572, "grad_norm": 0.114452064037323, "learning_rate": 0.00019634637477182953, "loss": 1.0555, "mean_token_accuracy": 0.7356832921504974, "num_tokens": 7270131.0, "step": 223 }, { "entropy": 1.0573597848415375, "epoch": 0.11978609625668449, "grad_norm": 0.11932668089866638, "learning_rate": 0.0001963022608954511, "loss": 1.0612, "mean_token_accuracy": 0.7345124781131744, "num_tokens": 7302899.0, "step": 224 }, { "entropy": 1.0339885354042053, "epoch": 0.12032085561497326, "grad_norm": 0.11648670583963394, "learning_rate": 0.00019625788788399696, "loss": 1.0235, "mean_token_accuracy": 0.7396539747714996, "num_tokens": 7335362.0, "step": 225 }, { "entropy": 1.0512037575244904, "epoch": 0.12085561497326203, "grad_norm": 0.10895999521017075, "learning_rate": 0.00019621325587070344, "loss": 1.0434, "mean_token_accuracy": 0.7386669218540192, "num_tokens": 7368130.0, "step": 226 }, { "entropy": 1.0894387364387512, "epoch": 0.1213903743315508, "grad_norm": 0.11288847029209137, "learning_rate": 0.00019616836498958465, "loss": 1.0788, "mean_token_accuracy": 0.7326047420501709, "num_tokens": 7400605.0, "step": 227 }, { "entropy": 1.0398660898208618, "epoch": 0.12192513368983957, "grad_norm": 0.11348799616098404, "learning_rate": 0.00019612321537543204, "loss": 1.0378, "mean_token_accuracy": 0.7441670298576355, "num_tokens": 7433152.0, "step": 228 }, { "entropy": 1.0839430689811707, "epoch": 0.12245989304812835, "grad_norm": 0.11604870110750198, "learning_rate": 0.00019607780716381388, "loss": 1.0873, "mean_token_accuracy": 0.7298715263605118, "num_tokens": 7465724.0, "step": 229 }, { "entropy": 1.0554787516593933, "epoch": 0.12299465240641712, "grad_norm": 0.11123524606227875, "learning_rate": 0.00019603214049107487, "loss": 1.0564, "mean_token_accuracy": 0.7370816320180893, "num_tokens": 7498434.0, "step": 230 }, { "entropy": 1.0759001970291138, "epoch": 0.12352941176470589, "grad_norm": 0.11666589230298996, "learning_rate": 0.00019598621549433585, "loss": 1.0862, "mean_token_accuracy": 0.7339395433664322, "num_tokens": 7531071.0, "step": 231 }, { "entropy": 1.061589390039444, "epoch": 0.12406417112299466, "grad_norm": 0.11102268844842911, "learning_rate": 0.00019594003231149332, "loss": 1.0663, "mean_token_accuracy": 0.7356441020965576, "num_tokens": 7563290.0, "step": 232 }, { "entropy": 1.0681027621030807, "epoch": 0.12459893048128343, "grad_norm": 0.11785712093114853, "learning_rate": 0.00019589359108121895, "loss": 1.0725, "mean_token_accuracy": 0.7335348129272461, "num_tokens": 7595961.0, "step": 233 }, { "entropy": 1.0394710004329681, "epoch": 0.12513368983957218, "grad_norm": 0.11143148690462112, "learning_rate": 0.00019584689194295936, "loss": 1.0345, "mean_token_accuracy": 0.7412939965724945, "num_tokens": 7628729.0, "step": 234 }, { "entropy": 1.0309483706951141, "epoch": 0.12566844919786097, "grad_norm": 0.1119936853647232, "learning_rate": 0.00019579993503693538, "loss": 1.0204, "mean_token_accuracy": 0.7455660849809647, "num_tokens": 7661131.0, "step": 235 }, { "entropy": 1.0677326619625092, "epoch": 0.12620320855614972, "grad_norm": 0.11484680324792862, "learning_rate": 0.00019575272050414207, "loss": 1.0735, "mean_token_accuracy": 0.7364104688167572, "num_tokens": 7693705.0, "step": 236 }, { "entropy": 1.0587773025035858, "epoch": 0.1267379679144385, "grad_norm": 0.11827201396226883, "learning_rate": 0.00019570524848634783, "loss": 1.0684, "mean_token_accuracy": 0.7397666275501251, "num_tokens": 7726473.0, "step": 237 }, { "entropy": 1.0162574499845505, "epoch": 0.12727272727272726, "grad_norm": 0.11548800766468048, "learning_rate": 0.0001956575191260943, "loss": 1.0103, "mean_token_accuracy": 0.7485337257385254, "num_tokens": 7759241.0, "step": 238 }, { "entropy": 1.032597839832306, "epoch": 0.12780748663101604, "grad_norm": 0.11080261319875717, "learning_rate": 0.00019560953256669587, "loss": 1.0355, "mean_token_accuracy": 0.738309696316719, "num_tokens": 7791975.0, "step": 239 }, { "entropy": 1.0555288195610046, "epoch": 0.12834224598930483, "grad_norm": 0.11107134819030762, "learning_rate": 0.00019556128895223914, "loss": 1.0604, "mean_token_accuracy": 0.7379528433084488, "num_tokens": 7824469.0, "step": 240 }, { "entropy": 1.1027131378650665, "epoch": 0.12887700534759358, "grad_norm": 0.11281178146600723, "learning_rate": 0.0001955127884275825, "loss": 1.1042, "mean_token_accuracy": 0.728361964225769, "num_tokens": 7856992.0, "step": 241 }, { "entropy": 1.067112773656845, "epoch": 0.12941176470588237, "grad_norm": 0.10883133858442307, "learning_rate": 0.00019546403113835593, "loss": 1.0644, "mean_token_accuracy": 0.7364980578422546, "num_tokens": 7889760.0, "step": 242 }, { "entropy": 1.014612227678299, "epoch": 0.12994652406417112, "grad_norm": 0.11845094710588455, "learning_rate": 0.00019541501723096017, "loss": 1.0173, "mean_token_accuracy": 0.7426800578832626, "num_tokens": 7922118.0, "step": 243 }, { "entropy": 1.0526921898126602, "epoch": 0.1304812834224599, "grad_norm": 0.1122470572590828, "learning_rate": 0.0001953657468525667, "loss": 1.0454, "mean_token_accuracy": 0.7415078282356262, "num_tokens": 7954886.0, "step": 244 }, { "entropy": 1.066915899515152, "epoch": 0.13101604278074866, "grad_norm": 0.11756544560194016, "learning_rate": 0.0001953162201511169, "loss": 1.0708, "mean_token_accuracy": 0.7362254858016968, "num_tokens": 7987371.0, "step": 245 }, { "entropy": 1.0463462471961975, "epoch": 0.13155080213903744, "grad_norm": 0.11988546699285507, "learning_rate": 0.00019526643727532197, "loss": 1.0468, "mean_token_accuracy": 0.7412023544311523, "num_tokens": 8020139.0, "step": 246 }, { "entropy": 1.0550406724214554, "epoch": 0.1320855614973262, "grad_norm": 0.13351471722126007, "learning_rate": 0.00019521639837466213, "loss": 1.0703, "mean_token_accuracy": 0.7336876839399338, "num_tokens": 8052907.0, "step": 247 }, { "entropy": 1.042212188243866, "epoch": 0.13262032085561498, "grad_norm": 0.125226229429245, "learning_rate": 0.00019516610359938648, "loss": 1.0332, "mean_token_accuracy": 0.7394306063652039, "num_tokens": 8085675.0, "step": 248 }, { "entropy": 1.0784787833690643, "epoch": 0.13315508021390374, "grad_norm": 0.11965163797140121, "learning_rate": 0.00019511555310051246, "loss": 1.0729, "mean_token_accuracy": 0.7330728024244308, "num_tokens": 8118247.0, "step": 249 }, { "entropy": 1.0723508298397064, "epoch": 0.13368983957219252, "grad_norm": 0.12876088917255402, "learning_rate": 0.00019506474702982523, "loss": 1.0664, "mean_token_accuracy": 0.7342070043087006, "num_tokens": 8151015.0, "step": 250 }, { "entropy": 1.0198566615581512, "epoch": 0.13422459893048128, "grad_norm": 0.12335485965013504, "learning_rate": 0.00019501368553987737, "loss": 1.0205, "mean_token_accuracy": 0.7444403767585754, "num_tokens": 8183783.0, "step": 251 }, { "entropy": 1.0937196016311646, "epoch": 0.13475935828877006, "grad_norm": 0.11913212388753891, "learning_rate": 0.00019496236878398848, "loss": 1.0806, "mean_token_accuracy": 0.7335934787988663, "num_tokens": 8216273.0, "step": 252 }, { "entropy": 1.0336759388446808, "epoch": 0.13529411764705881, "grad_norm": 0.11281498521566391, "learning_rate": 0.00019491079691624454, "loss": 1.0276, "mean_token_accuracy": 0.7413501590490341, "num_tokens": 8248701.0, "step": 253 }, { "entropy": 0.9990439265966415, "epoch": 0.1358288770053476, "grad_norm": 0.11167177557945251, "learning_rate": 0.0001948589700914976, "loss": 1.0042, "mean_token_accuracy": 0.7490535080432892, "num_tokens": 8280796.0, "step": 254 }, { "entropy": 1.0568778216838837, "epoch": 0.13636363636363635, "grad_norm": 0.11134691536426544, "learning_rate": 0.0001948068884653652, "loss": 1.0593, "mean_token_accuracy": 0.7315690666437149, "num_tokens": 8313379.0, "step": 255 }, { "entropy": 1.03424671292305, "epoch": 0.13689839572192514, "grad_norm": 0.11792710423469543, "learning_rate": 0.00019475455219422998, "loss": 1.0441, "mean_token_accuracy": 0.740011528134346, "num_tokens": 8345856.0, "step": 256 }, { "entropy": 1.0480815172195435, "epoch": 0.1374331550802139, "grad_norm": 0.12474671006202698, "learning_rate": 0.00019470196143523923, "loss": 1.047, "mean_token_accuracy": 0.7390372902154922, "num_tokens": 8378372.0, "step": 257 }, { "entropy": 1.0477408915758133, "epoch": 0.13796791443850268, "grad_norm": 0.1266440451145172, "learning_rate": 0.00019464911634630433, "loss": 1.043, "mean_token_accuracy": 0.7413789927959442, "num_tokens": 8410900.0, "step": 258 }, { "entropy": 1.0490031838417053, "epoch": 0.13850267379679143, "grad_norm": 0.14051200449466705, "learning_rate": 0.00019459601708610036, "loss": 1.0402, "mean_token_accuracy": 0.7361215502023697, "num_tokens": 8443641.0, "step": 259 }, { "entropy": 1.0794505178928375, "epoch": 0.13903743315508021, "grad_norm": 0.158530592918396, "learning_rate": 0.0001945426638140655, "loss": 1.0739, "mean_token_accuracy": 0.7352101802825928, "num_tokens": 8476266.0, "step": 260 }, { "entropy": 1.0522571057081223, "epoch": 0.139572192513369, "grad_norm": 0.1495906412601471, "learning_rate": 0.00019448905669040077, "loss": 1.0482, "mean_token_accuracy": 0.7383309006690979, "num_tokens": 8509034.0, "step": 261 }, { "entropy": 1.0746977925300598, "epoch": 0.14010695187165775, "grad_norm": 0.12712834775447845, "learning_rate": 0.00019443519587606934, "loss": 1.083, "mean_token_accuracy": 0.7317632138729095, "num_tokens": 8541802.0, "step": 262 }, { "entropy": 1.03669573366642, "epoch": 0.14064171122994654, "grad_norm": 0.11646346002817154, "learning_rate": 0.0001943810815327961, "loss": 1.043, "mean_token_accuracy": 0.740713581442833, "num_tokens": 8574570.0, "step": 263 }, { "entropy": 1.0398886501789093, "epoch": 0.1411764705882353, "grad_norm": 0.13226716220378876, "learning_rate": 0.0001943267138230672, "loss": 1.0401, "mean_token_accuracy": 0.7394306063652039, "num_tokens": 8607338.0, "step": 264 }, { "entropy": 1.0552105903625488, "epoch": 0.14171122994652408, "grad_norm": 0.14164507389068604, "learning_rate": 0.00019427209291012965, "loss": 1.0592, "mean_token_accuracy": 0.7373839169740677, "num_tokens": 8640106.0, "step": 265 }, { "entropy": 1.0696653425693512, "epoch": 0.14224598930481283, "grad_norm": 0.11818902939558029, "learning_rate": 0.00019421721895799063, "loss": 1.067, "mean_token_accuracy": 0.7361311912536621, "num_tokens": 8672636.0, "step": 266 }, { "entropy": 1.0401161164045334, "epoch": 0.14278074866310161, "grad_norm": 0.1131942868232727, "learning_rate": 0.0001941620921314172, "loss": 1.0391, "mean_token_accuracy": 0.7399394363164902, "num_tokens": 8705347.0, "step": 267 }, { "entropy": 1.0111385881900787, "epoch": 0.14331550802139037, "grad_norm": 0.13214008510112762, "learning_rate": 0.00019410671259593562, "loss": 1.0127, "mean_token_accuracy": 0.7480546683073044, "num_tokens": 8737150.0, "step": 268 }, { "entropy": 1.02693110704422, "epoch": 0.14385026737967915, "grad_norm": 0.11571832001209259, "learning_rate": 0.000194051080517831, "loss": 1.0207, "mean_token_accuracy": 0.7417827546596527, "num_tokens": 8769918.0, "step": 269 }, { "entropy": 1.0444089770317078, "epoch": 0.1443850267379679, "grad_norm": 0.11474455893039703, "learning_rate": 0.00019399519606414674, "loss": 1.0607, "mean_token_accuracy": 0.7389113008975983, "num_tokens": 8802686.0, "step": 270 }, { "entropy": 1.0395529866218567, "epoch": 0.1449197860962567, "grad_norm": 0.12115021049976349, "learning_rate": 0.00019393905940268404, "loss": 1.0433, "mean_token_accuracy": 0.735184520483017, "num_tokens": 8835454.0, "step": 271 }, { "entropy": 1.0157050043344498, "epoch": 0.14545454545454545, "grad_norm": 0.11376800388097763, "learning_rate": 0.00019388267070200135, "loss": 1.0048, "mean_token_accuracy": 0.7480817437171936, "num_tokens": 8867817.0, "step": 272 }, { "entropy": 1.0406829416751862, "epoch": 0.14598930481283423, "grad_norm": 0.12187753617763519, "learning_rate": 0.00019382603013141402, "loss": 1.0516, "mean_token_accuracy": 0.7399662286043167, "num_tokens": 8900363.0, "step": 273 }, { "entropy": 1.0107262432575226, "epoch": 0.146524064171123, "grad_norm": 0.11920078098773956, "learning_rate": 0.00019376913786099348, "loss": 1.0196, "mean_token_accuracy": 0.7423813045024872, "num_tokens": 8932749.0, "step": 274 }, { "entropy": 1.0222293138504028, "epoch": 0.14705882352941177, "grad_norm": 0.11740428954362869, "learning_rate": 0.0001937119940615671, "loss": 1.0285, "mean_token_accuracy": 0.7422104030847549, "num_tokens": 8965517.0, "step": 275 }, { "entropy": 1.0424260348081589, "epoch": 0.14759358288770053, "grad_norm": 0.12438567727804184, "learning_rate": 0.00019365459890471746, "loss": 1.0362, "mean_token_accuracy": 0.7398019582033157, "num_tokens": 8997876.0, "step": 276 }, { "entropy": 1.0288828313350677, "epoch": 0.1481283422459893, "grad_norm": 0.11123205721378326, "learning_rate": 0.00019359695256278178, "loss": 1.0216, "mean_token_accuracy": 0.7422104179859161, "num_tokens": 9030644.0, "step": 277 }, { "entropy": 1.020155742764473, "epoch": 0.14866310160427806, "grad_norm": 0.11991836130619049, "learning_rate": 0.00019353905520885166, "loss": 1.0231, "mean_token_accuracy": 0.7435545027256012, "num_tokens": 9063412.0, "step": 278 }, { "entropy": 1.0389754474163055, "epoch": 0.14919786096256685, "grad_norm": 0.1264876425266266, "learning_rate": 0.00019348090701677223, "loss": 1.0405, "mean_token_accuracy": 0.7415297776460648, "num_tokens": 9096021.0, "step": 279 }, { "entropy": 0.9926124960184097, "epoch": 0.1497326203208556, "grad_norm": 0.11023156344890594, "learning_rate": 0.00019342250816114197, "loss": 0.9886, "mean_token_accuracy": 0.7518328428268433, "num_tokens": 9128789.0, "step": 280 }, { "entropy": 0.9978072941303253, "epoch": 0.15026737967914439, "grad_norm": 0.12432621419429779, "learning_rate": 0.00019336385881731186, "loss": 0.9991, "mean_token_accuracy": 0.7462638318538666, "num_tokens": 9161515.0, "step": 281 }, { "entropy": 1.0047390162944794, "epoch": 0.15080213903743314, "grad_norm": 0.12100157886743546, "learning_rate": 0.00019330495916138513, "loss": 0.9988, "mean_token_accuracy": 0.7465151995420456, "num_tokens": 9193969.0, "step": 282 }, { "entropy": 1.051861196756363, "epoch": 0.15133689839572192, "grad_norm": 0.11842017620801926, "learning_rate": 0.00019324580937021653, "loss": 1.0475, "mean_token_accuracy": 0.7398037314414978, "num_tokens": 9226307.0, "step": 283 }, { "entropy": 1.1205312013626099, "epoch": 0.1518716577540107, "grad_norm": 0.12409147620201111, "learning_rate": 0.00019318640962141188, "loss": 1.1388, "mean_token_accuracy": 0.7234237641096115, "num_tokens": 9259075.0, "step": 284 }, { "entropy": 1.0474530458450317, "epoch": 0.15240641711229946, "grad_norm": 0.12057706713676453, "learning_rate": 0.0001931267600933276, "loss": 1.041, "mean_token_accuracy": 0.7406096160411835, "num_tokens": 9291580.0, "step": 285 }, { "entropy": 1.071075975894928, "epoch": 0.15294117647058825, "grad_norm": 0.11215157806873322, "learning_rate": 0.00019306686096506997, "loss": 1.0546, "mean_token_accuracy": 0.7369257062673569, "num_tokens": 9324348.0, "step": 286 }, { "entropy": 1.0828769505023956, "epoch": 0.153475935828877, "grad_norm": 0.12447725981473923, "learning_rate": 0.00019300671241649495, "loss": 1.0681, "mean_token_accuracy": 0.7351312190294266, "num_tokens": 9356912.0, "step": 287 }, { "entropy": 1.0940111577510834, "epoch": 0.15401069518716579, "grad_norm": 0.118400439620018, "learning_rate": 0.00019294631462820721, "loss": 1.0894, "mean_token_accuracy": 0.7328831255435944, "num_tokens": 9389393.0, "step": 288 }, { "entropy": 1.068321093916893, "epoch": 0.15454545454545454, "grad_norm": 0.11101941764354706, "learning_rate": 0.00019288566778155993, "loss": 1.0553, "mean_token_accuracy": 0.7367358803749084, "num_tokens": 9422119.0, "step": 289 }, { "entropy": 1.0740725696086884, "epoch": 0.15508021390374332, "grad_norm": 0.1184263750910759, "learning_rate": 0.00019282477205865406, "loss": 1.0899, "mean_token_accuracy": 0.7305536270141602, "num_tokens": 9454682.0, "step": 290 }, { "entropy": 1.0920554399490356, "epoch": 0.15561497326203208, "grad_norm": 0.12500989437103271, "learning_rate": 0.00019276362764233783, "loss": 1.0918, "mean_token_accuracy": 0.7287611365318298, "num_tokens": 9487286.0, "step": 291 }, { "entropy": 0.9881408363580704, "epoch": 0.15614973262032086, "grad_norm": 0.12167588621377945, "learning_rate": 0.00019270223471620624, "loss": 1.0068, "mean_token_accuracy": 0.7483971863985062, "num_tokens": 9520021.0, "step": 292 }, { "entropy": 1.069222331047058, "epoch": 0.15668449197860962, "grad_norm": 0.11795824766159058, "learning_rate": 0.0001926405934646005, "loss": 1.0925, "mean_token_accuracy": 0.7297313064336777, "num_tokens": 9552590.0, "step": 293 }, { "entropy": 1.0657239258289337, "epoch": 0.1572192513368984, "grad_norm": 0.12100639939308167, "learning_rate": 0.00019257870407260742, "loss": 1.053, "mean_token_accuracy": 0.7422927170991898, "num_tokens": 9585309.0, "step": 294 }, { "entropy": 1.0440329909324646, "epoch": 0.15775401069518716, "grad_norm": 0.11367267370223999, "learning_rate": 0.00019251656672605887, "loss": 1.0335, "mean_token_accuracy": 0.7432643920183182, "num_tokens": 9617995.0, "step": 295 }, { "entropy": 1.0245574712753296, "epoch": 0.15828877005347594, "grad_norm": 0.11685089766979218, "learning_rate": 0.00019245418161153126, "loss": 1.0271, "mean_token_accuracy": 0.7453873455524445, "num_tokens": 9650763.0, "step": 296 }, { "entropy": 1.0340452790260315, "epoch": 0.1588235294117647, "grad_norm": 0.11204793304204941, "learning_rate": 0.00019239154891634498, "loss": 1.0265, "mean_token_accuracy": 0.7396213859319687, "num_tokens": 9683363.0, "step": 297 }, { "entropy": 0.9923126399517059, "epoch": 0.15935828877005348, "grad_norm": 0.1120043396949768, "learning_rate": 0.00019232866882856376, "loss": 0.9909, "mean_token_accuracy": 0.7515579164028168, "num_tokens": 9716131.0, "step": 298 }, { "entropy": 1.0405441522598267, "epoch": 0.15989304812834224, "grad_norm": 0.11795374006032944, "learning_rate": 0.00019226554153699423, "loss": 1.0353, "mean_token_accuracy": 0.7428170144557953, "num_tokens": 9748696.0, "step": 299 }, { "entropy": 1.0656865388154984, "epoch": 0.16042780748663102, "grad_norm": 0.11844547092914581, "learning_rate": 0.00019220216723118527, "loss": 1.063, "mean_token_accuracy": 0.7360092848539352, "num_tokens": 9781464.0, "step": 300 }, { "entropy": 1.0744484663009644, "epoch": 0.16096256684491977, "grad_norm": 0.1192609965801239, "learning_rate": 0.00019213854610142738, "loss": 1.0867, "mean_token_accuracy": 0.7393917888402939, "num_tokens": 9814042.0, "step": 301 }, { "entropy": 0.9988169819116592, "epoch": 0.16149732620320856, "grad_norm": 0.1213647648692131, "learning_rate": 0.0001920746783387523, "loss": 1.0146, "mean_token_accuracy": 0.7446236610412598, "num_tokens": 9846810.0, "step": 302 }, { "entropy": 0.9821544885635376, "epoch": 0.1620320855614973, "grad_norm": 0.1157236248254776, "learning_rate": 0.00019201056413493224, "loss": 0.9847, "mean_token_accuracy": 0.7526452541351318, "num_tokens": 9879232.0, "step": 303 }, { "entropy": 1.0186610370874405, "epoch": 0.1625668449197861, "grad_norm": 0.1273307055234909, "learning_rate": 0.00019194620368247938, "loss": 1.0323, "mean_token_accuracy": 0.7344208359718323, "num_tokens": 9912000.0, "step": 304 }, { "entropy": 0.9831012636423111, "epoch": 0.16310160427807488, "grad_norm": 0.1286727786064148, "learning_rate": 0.0001918815971746454, "loss": 0.9839, "mean_token_accuracy": 0.7520772218704224, "num_tokens": 9944768.0, "step": 305 }, { "entropy": 1.0608522593975067, "epoch": 0.16363636363636364, "grad_norm": 0.12535078823566437, "learning_rate": 0.0001918167448054207, "loss": 1.0415, "mean_token_accuracy": 0.7414893060922623, "num_tokens": 9977158.0, "step": 306 }, { "entropy": 1.082308292388916, "epoch": 0.16417112299465242, "grad_norm": 0.13904598355293274, "learning_rate": 0.00019175164676953389, "loss": 1.0945, "mean_token_accuracy": 0.7317221611738205, "num_tokens": 10009789.0, "step": 307 }, { "entropy": 1.021088719367981, "epoch": 0.16470588235294117, "grad_norm": 0.15948276221752167, "learning_rate": 0.00019168630326245133, "loss": 1.0002, "mean_token_accuracy": 0.7490701675415039, "num_tokens": 10042367.0, "step": 308 }, { "entropy": 1.0862839818000793, "epoch": 0.16524064171122996, "grad_norm": 0.16437754034996033, "learning_rate": 0.0001916207144803764, "loss": 1.0709, "mean_token_accuracy": 0.7315387725830078, "num_tokens": 10074834.0, "step": 309 }, { "entropy": 1.0433639287948608, "epoch": 0.1657754010695187, "grad_norm": 0.14935532212257385, "learning_rate": 0.0001915548806202489, "loss": 1.0226, "mean_token_accuracy": 0.744517520070076, "num_tokens": 10107112.0, "step": 310 }, { "entropy": 1.0890674889087677, "epoch": 0.1663101604278075, "grad_norm": 0.11999236047267914, "learning_rate": 0.00019148880187974462, "loss": 1.0863, "mean_token_accuracy": 0.7288990914821625, "num_tokens": 10139343.0, "step": 311 }, { "entropy": 1.0622902810573578, "epoch": 0.16684491978609625, "grad_norm": 0.11667732894420624, "learning_rate": 0.00019142247845727452, "loss": 1.0516, "mean_token_accuracy": 0.7366507947444916, "num_tokens": 10172111.0, "step": 312 }, { "entropy": 1.0628046095371246, "epoch": 0.16737967914438504, "grad_norm": 0.12990324199199677, "learning_rate": 0.00019135591055198433, "loss": 1.0667, "mean_token_accuracy": 0.7355510890483856, "num_tokens": 10204879.0, "step": 313 }, { "entropy": 1.0701580345630646, "epoch": 0.1679144385026738, "grad_norm": 0.12336275726556778, "learning_rate": 0.00019128909836375391, "loss": 1.0886, "mean_token_accuracy": 0.7303274720907211, "num_tokens": 10237647.0, "step": 314 }, { "entropy": 1.0163717865943909, "epoch": 0.16844919786096257, "grad_norm": 0.12419279664754868, "learning_rate": 0.00019122204209319648, "loss": 1.0398, "mean_token_accuracy": 0.7439745962619781, "num_tokens": 10270290.0, "step": 315 }, { "entropy": 0.9949931651353836, "epoch": 0.16898395721925133, "grad_norm": 0.12302132695913315, "learning_rate": 0.00019115474194165822, "loss": 1.002, "mean_token_accuracy": 0.7492363154888153, "num_tokens": 10303058.0, "step": 316 }, { "entropy": 1.0401550829410553, "epoch": 0.1695187165775401, "grad_norm": 0.13105161488056183, "learning_rate": 0.00019108719811121772, "loss": 1.0611, "mean_token_accuracy": 0.7385796755552292, "num_tokens": 10335660.0, "step": 317 }, { "entropy": 1.046198695898056, "epoch": 0.17005347593582887, "grad_norm": 0.11633765697479248, "learning_rate": 0.0001910194108046851, "loss": 1.0372, "mean_token_accuracy": 0.7395435571670532, "num_tokens": 10368347.0, "step": 318 }, { "entropy": 1.1130651533603668, "epoch": 0.17058823529411765, "grad_norm": 0.12539620697498322, "learning_rate": 0.00019095138022560154, "loss": 1.1134, "mean_token_accuracy": 0.724764883518219, "num_tokens": 10401040.0, "step": 319 }, { "entropy": 1.0333769619464874, "epoch": 0.1711229946524064, "grad_norm": 0.12401431053876877, "learning_rate": 0.00019088310657823876, "loss": 1.0277, "mean_token_accuracy": 0.7436408698558807, "num_tokens": 10433536.0, "step": 320 }, { "entropy": 1.0465964674949646, "epoch": 0.1716577540106952, "grad_norm": 0.12771785259246826, "learning_rate": 0.00019081459006759833, "loss": 1.0341, "mean_token_accuracy": 0.7408052384853363, "num_tokens": 10466304.0, "step": 321 }, { "entropy": 1.0823104679584503, "epoch": 0.17219251336898395, "grad_norm": 0.1222015768289566, "learning_rate": 0.000190745830899411, "loss": 1.0656, "mean_token_accuracy": 0.7383493930101395, "num_tokens": 10498790.0, "step": 322 }, { "entropy": 1.1021881699562073, "epoch": 0.17272727272727273, "grad_norm": 0.1238536611199379, "learning_rate": 0.0001906768292801362, "loss": 1.0898, "mean_token_accuracy": 0.7280010730028152, "num_tokens": 10531248.0, "step": 323 }, { "entropy": 1.0325301140546799, "epoch": 0.17326203208556148, "grad_norm": 0.115191251039505, "learning_rate": 0.0001906075854169613, "loss": 1.0249, "mean_token_accuracy": 0.7412202507257462, "num_tokens": 10563888.0, "step": 324 }, { "entropy": 1.032944694161415, "epoch": 0.17379679144385027, "grad_norm": 0.12355396151542664, "learning_rate": 0.00019053809951780106, "loss": 1.0509, "mean_token_accuracy": 0.7413963824510574, "num_tokens": 10596639.0, "step": 325 }, { "entropy": 1.0835492014884949, "epoch": 0.17433155080213902, "grad_norm": 0.12105121463537216, "learning_rate": 0.00019046837179129695, "loss": 1.0933, "mean_token_accuracy": 0.7261119484901428, "num_tokens": 10629407.0, "step": 326 }, { "entropy": 1.0530675500631332, "epoch": 0.1748663101604278, "grad_norm": 0.1176065057516098, "learning_rate": 0.00019039840244681664, "loss": 1.0537, "mean_token_accuracy": 0.7409262210130692, "num_tokens": 10661368.0, "step": 327 }, { "entropy": 1.0586835443973541, "epoch": 0.1754010695187166, "grad_norm": 0.11797241121530533, "learning_rate": 0.00019032819169445322, "loss": 1.0729, "mean_token_accuracy": 0.73880934715271, "num_tokens": 10694127.0, "step": 328 }, { "entropy": 1.0565836429595947, "epoch": 0.17593582887700535, "grad_norm": 0.12271396815776825, "learning_rate": 0.00019025773974502472, "loss": 1.0577, "mean_token_accuracy": 0.7349401414394379, "num_tokens": 10726895.0, "step": 329 }, { "entropy": 1.055390864610672, "epoch": 0.17647058823529413, "grad_norm": 0.12326763570308685, "learning_rate": 0.0001901870468100733, "loss": 1.057, "mean_token_accuracy": 0.7380874454975128, "num_tokens": 10759200.0, "step": 330 }, { "entropy": 1.0646967589855194, "epoch": 0.17700534759358288, "grad_norm": 0.11958156526088715, "learning_rate": 0.00019011611310186478, "loss": 1.0731, "mean_token_accuracy": 0.7317423224449158, "num_tokens": 10791603.0, "step": 331 }, { "entropy": 1.0165407955646515, "epoch": 0.17754010695187167, "grad_norm": 0.1222132071852684, "learning_rate": 0.00019004493883338796, "loss": 1.0066, "mean_token_accuracy": 0.7471896409988403, "num_tokens": 10824371.0, "step": 332 }, { "entropy": 1.048438012599945, "epoch": 0.17807486631016042, "grad_norm": 0.12711326777935028, "learning_rate": 0.0001899735242183539, "loss": 1.0211, "mean_token_accuracy": 0.7453262507915497, "num_tokens": 10857139.0, "step": 333 }, { "entropy": 1.016156643629074, "epoch": 0.1786096256684492, "grad_norm": 0.11839782446622849, "learning_rate": 0.00018990186947119534, "loss": 1.0209, "mean_token_accuracy": 0.741521418094635, "num_tokens": 10889784.0, "step": 334 }, { "entropy": 1.0444038808345795, "epoch": 0.17914438502673796, "grad_norm": 0.11765804886817932, "learning_rate": 0.0001898299748070661, "loss": 1.0497, "mean_token_accuracy": 0.7371090054512024, "num_tokens": 10922552.0, "step": 335 }, { "entropy": 1.0168912708759308, "epoch": 0.17967914438502675, "grad_norm": 0.11785600334405899, "learning_rate": 0.00018975784044184031, "loss": 1.0209, "mean_token_accuracy": 0.7431573867797852, "num_tokens": 10955320.0, "step": 336 }, { "entropy": 1.0603578388690948, "epoch": 0.1802139037433155, "grad_norm": 0.343005895614624, "learning_rate": 0.00018968546659211194, "loss": 1.0687, "mean_token_accuracy": 0.7360398471355438, "num_tokens": 10988088.0, "step": 337 }, { "entropy": 1.022160217165947, "epoch": 0.18074866310160428, "grad_norm": 0.12493393570184708, "learning_rate": 0.00018961285347519392, "loss": 1.028, "mean_token_accuracy": 0.7398343086242676, "num_tokens": 11020497.0, "step": 338 }, { "entropy": 0.9765367656946182, "epoch": 0.18128342245989304, "grad_norm": 0.12323349714279175, "learning_rate": 0.00018954000130911772, "loss": 0.9741, "mean_token_accuracy": 0.7540017068386078, "num_tokens": 11053265.0, "step": 339 }, { "entropy": 1.0765683948993683, "epoch": 0.18181818181818182, "grad_norm": 0.12275473773479462, "learning_rate": 0.00018946691031263252, "loss": 1.084, "mean_token_accuracy": 0.7321603298187256, "num_tokens": 11086033.0, "step": 340 }, { "entropy": 1.035182625055313, "epoch": 0.18235294117647058, "grad_norm": 0.11736733466386795, "learning_rate": 0.0001893935807052047, "loss": 1.0236, "mean_token_accuracy": 0.7407746911048889, "num_tokens": 11118801.0, "step": 341 }, { "entropy": 1.0754371583461761, "epoch": 0.18288770053475936, "grad_norm": 0.11700764298439026, "learning_rate": 0.00018932001270701706, "loss": 1.0737, "mean_token_accuracy": 0.7352294772863388, "num_tokens": 11151427.0, "step": 342 }, { "entropy": 1.0477129220962524, "epoch": 0.18342245989304812, "grad_norm": 0.12123466283082962, "learning_rate": 0.00018924620653896815, "loss": 1.0226, "mean_token_accuracy": 0.7426380813121796, "num_tokens": 11184195.0, "step": 343 }, { "entropy": 1.0352286100387573, "epoch": 0.1839572192513369, "grad_norm": 0.13648684322834015, "learning_rate": 0.00018917216242267179, "loss": 1.0256, "mean_token_accuracy": 0.7435887604951859, "num_tokens": 11216545.0, "step": 344 }, { "entropy": 0.979494497179985, "epoch": 0.18449197860962566, "grad_norm": 0.16731642186641693, "learning_rate": 0.00018909788058045612, "loss": 0.9789, "mean_token_accuracy": 0.7541547864675522, "num_tokens": 11248991.0, "step": 345 }, { "entropy": 0.9837481826543808, "epoch": 0.18502673796791444, "grad_norm": 0.19374457001686096, "learning_rate": 0.0001890233612353632, "loss": 0.9886, "mean_token_accuracy": 0.7531463801860809, "num_tokens": 11281759.0, "step": 346 }, { "entropy": 1.0341242849826813, "epoch": 0.1855614973262032, "grad_norm": 0.18110328912734985, "learning_rate": 0.00018894860461114818, "loss": 1.0489, "mean_token_accuracy": 0.7438734620809555, "num_tokens": 11314486.0, "step": 347 }, { "entropy": 0.9882454127073288, "epoch": 0.18609625668449198, "grad_norm": 0.12236490100622177, "learning_rate": 0.00018887361093227872, "loss": 1.0048, "mean_token_accuracy": 0.7507801651954651, "num_tokens": 11347165.0, "step": 348 }, { "entropy": 1.0009735822677612, "epoch": 0.18663101604278076, "grad_norm": 0.13956494629383087, "learning_rate": 0.00018879838042393414, "loss": 1.0179, "mean_token_accuracy": 0.7439083456993103, "num_tokens": 11379742.0, "step": 349 }, { "entropy": 1.0519947409629822, "epoch": 0.18716577540106952, "grad_norm": 0.15381091833114624, "learning_rate": 0.00018872291331200504, "loss": 1.0515, "mean_token_accuracy": 0.7423379421234131, "num_tokens": 11412483.0, "step": 350 }, { "entropy": 1.053426742553711, "epoch": 0.1877005347593583, "grad_norm": 0.11703289300203323, "learning_rate": 0.00018864720982309242, "loss": 1.0532, "mean_token_accuracy": 0.7338098883628845, "num_tokens": 11445251.0, "step": 351 }, { "entropy": 1.0194750875234604, "epoch": 0.18823529411764706, "grad_norm": 0.1410088986158371, "learning_rate": 0.00018857127018450694, "loss": 1.02, "mean_token_accuracy": 0.7453415095806122, "num_tokens": 11477668.0, "step": 352 }, { "entropy": 1.0677756518125534, "epoch": 0.18877005347593584, "grad_norm": 0.13086667656898499, "learning_rate": 0.00018849509462426844, "loss": 1.0452, "mean_token_accuracy": 0.7393670529127121, "num_tokens": 11510162.0, "step": 353 }, { "entropy": 1.0423657298088074, "epoch": 0.1893048128342246, "grad_norm": 0.12789122760295868, "learning_rate": 0.00018841868337110508, "loss": 1.0239, "mean_token_accuracy": 0.7400415390729904, "num_tokens": 11542930.0, "step": 354 }, { "entropy": 1.0276905447244644, "epoch": 0.18983957219251338, "grad_norm": 0.1383594572544098, "learning_rate": 0.00018834203665445273, "loss": 1.0199, "mean_token_accuracy": 0.7431428879499435, "num_tokens": 11575513.0, "step": 355 }, { "entropy": 1.017955631017685, "epoch": 0.19037433155080213, "grad_norm": 0.12443074584007263, "learning_rate": 0.0001882651547044543, "loss": 1.0202, "mean_token_accuracy": 0.7448633909225464, "num_tokens": 11607784.0, "step": 356 }, { "entropy": 1.047084480524063, "epoch": 0.19090909090909092, "grad_norm": 0.13764646649360657, "learning_rate": 0.00018818803775195902, "loss": 1.044, "mean_token_accuracy": 0.7388728559017181, "num_tokens": 11640511.0, "step": 357 }, { "entropy": 1.0577262043952942, "epoch": 0.19144385026737967, "grad_norm": 0.12104304879903793, "learning_rate": 0.00018811068602852182, "loss": 1.0679, "mean_token_accuracy": 0.7354218810796738, "num_tokens": 11673124.0, "step": 358 }, { "entropy": 1.0496059656143188, "epoch": 0.19197860962566846, "grad_norm": 0.1261080801486969, "learning_rate": 0.00018803309976640238, "loss": 1.066, "mean_token_accuracy": 0.7340228408575058, "num_tokens": 11705792.0, "step": 359 }, { "entropy": 1.0624986588954926, "epoch": 0.1925133689839572, "grad_norm": 0.12073995172977448, "learning_rate": 0.00018795527919856478, "loss": 1.0651, "mean_token_accuracy": 0.7365228086709976, "num_tokens": 11738238.0, "step": 360 }, { "entropy": 1.0668971538543701, "epoch": 0.193048128342246, "grad_norm": 0.12623706459999084, "learning_rate": 0.00018787722455867653, "loss": 1.0783, "mean_token_accuracy": 0.7339484840631485, "num_tokens": 11770819.0, "step": 361 }, { "entropy": 0.9814553409814835, "epoch": 0.19358288770053475, "grad_norm": 0.11412834376096725, "learning_rate": 0.0001877989360811081, "loss": 0.9742, "mean_token_accuracy": 0.7551625072956085, "num_tokens": 11803587.0, "step": 362 }, { "entropy": 1.0311852097511292, "epoch": 0.19411764705882353, "grad_norm": 0.12380577623844147, "learning_rate": 0.00018772041400093192, "loss": 1.0344, "mean_token_accuracy": 0.7427579909563065, "num_tokens": 11836073.0, "step": 363 }, { "entropy": 1.0336099565029144, "epoch": 0.1946524064171123, "grad_norm": 0.11791545897722244, "learning_rate": 0.000187641658553922, "loss": 1.0344, "mean_token_accuracy": 0.7441349029541016, "num_tokens": 11868841.0, "step": 364 }, { "entropy": 1.012706995010376, "epoch": 0.19518716577540107, "grad_norm": 0.13155929744243622, "learning_rate": 0.00018756266997655305, "loss": 1.004, "mean_token_accuracy": 0.7522120326757431, "num_tokens": 11901481.0, "step": 365 }, { "entropy": 0.9872090071439743, "epoch": 0.19572192513368983, "grad_norm": 0.12945513427257538, "learning_rate": 0.00018748344850599968, "loss": 0.9943, "mean_token_accuracy": 0.7532074749469757, "num_tokens": 11934249.0, "step": 366 }, { "entropy": 1.0130062401294708, "epoch": 0.1962566844919786, "grad_norm": 0.1238793134689331, "learning_rate": 0.0001874039943801359, "loss": 1.0095, "mean_token_accuracy": 0.7486846446990967, "num_tokens": 11966973.0, "step": 367 }, { "entropy": 1.037500947713852, "epoch": 0.19679144385026737, "grad_norm": 0.11912263929843903, "learning_rate": 0.00018732430783753424, "loss": 1.0407, "mean_token_accuracy": 0.7420326620340347, "num_tokens": 11999419.0, "step": 368 }, { "entropy": 0.9854730367660522, "epoch": 0.19732620320855615, "grad_norm": 0.12523306906223297, "learning_rate": 0.00018724438911746512, "loss": 0.9861, "mean_token_accuracy": 0.7542155385017395, "num_tokens": 12032187.0, "step": 369 }, { "entropy": 1.0310074090957642, "epoch": 0.19786096256684493, "grad_norm": 0.13841532170772552, "learning_rate": 0.0001871642384598962, "loss": 1.0353, "mean_token_accuracy": 0.7444813102483749, "num_tokens": 12064494.0, "step": 370 }, { "entropy": 1.0406521409749985, "epoch": 0.1983957219251337, "grad_norm": 0.12111449986696243, "learning_rate": 0.00018708385610549132, "loss": 1.0325, "mean_token_accuracy": 0.7387104332447052, "num_tokens": 12097191.0, "step": 371 }, { "entropy": 1.0652667582035065, "epoch": 0.19893048128342247, "grad_norm": 0.12144946306943893, "learning_rate": 0.0001870032422956103, "loss": 1.0514, "mean_token_accuracy": 0.7392802536487579, "num_tokens": 12129721.0, "step": 372 }, { "entropy": 1.081633985042572, "epoch": 0.19946524064171123, "grad_norm": 0.11997553706169128, "learning_rate": 0.00018692239727230775, "loss": 1.0706, "mean_token_accuracy": 0.7328018248081207, "num_tokens": 12162489.0, "step": 373 }, { "entropy": 1.019247144460678, "epoch": 0.2, "grad_norm": 0.12147719413042068, "learning_rate": 0.00018684132127833261, "loss": 1.0006, "mean_token_accuracy": 0.7459330856800079, "num_tokens": 12194555.0, "step": 374 }, { "entropy": 1.0349364280700684, "epoch": 0.20053475935828877, "grad_norm": 0.1279066652059555, "learning_rate": 0.00018676001455712735, "loss": 1.044, "mean_token_accuracy": 0.743842676281929, "num_tokens": 12227219.0, "step": 375 }, { "entropy": 1.0251546055078506, "epoch": 0.20106951871657755, "grad_norm": 0.12099071592092514, "learning_rate": 0.00018667847735282718, "loss": 1.0499, "mean_token_accuracy": 0.7395733743906021, "num_tokens": 12259788.0, "step": 376 }, { "entropy": 1.0475497543811798, "epoch": 0.2016042780748663, "grad_norm": 0.12799841165542603, "learning_rate": 0.0001865967099102594, "loss": 1.0465, "mean_token_accuracy": 0.7413245439529419, "num_tokens": 12292556.0, "step": 377 }, { "entropy": 1.039279356598854, "epoch": 0.2021390374331551, "grad_norm": 0.12248227000236511, "learning_rate": 0.00018651471247494271, "loss": 1.0659, "mean_token_accuracy": 0.7357649058103561, "num_tokens": 12325324.0, "step": 378 }, { "entropy": 1.0145466774702072, "epoch": 0.20267379679144384, "grad_norm": 0.12063663452863693, "learning_rate": 0.0001864324852930862, "loss": 1.0182, "mean_token_accuracy": 0.7449902296066284, "num_tokens": 12358092.0, "step": 379 }, { "entropy": 1.0798068642616272, "epoch": 0.20320855614973263, "grad_norm": 0.12489446997642517, "learning_rate": 0.00018635002861158907, "loss": 1.0682, "mean_token_accuracy": 0.7357427775859833, "num_tokens": 12390537.0, "step": 380 }, { "entropy": 1.076945185661316, "epoch": 0.20374331550802138, "grad_norm": 0.11916907131671906, "learning_rate": 0.00018626734267803937, "loss": 1.0765, "mean_token_accuracy": 0.7331989258527756, "num_tokens": 12423305.0, "step": 381 }, { "entropy": 1.0346795320510864, "epoch": 0.20427807486631017, "grad_norm": 0.1225687712430954, "learning_rate": 0.00018618442774071364, "loss": 1.0329, "mean_token_accuracy": 0.7417216598987579, "num_tokens": 12456073.0, "step": 382 }, { "entropy": 1.0055777728557587, "epoch": 0.20481283422459892, "grad_norm": 0.12688912451267242, "learning_rate": 0.00018610128404857604, "loss": 0.9866, "mean_token_accuracy": 0.750934898853302, "num_tokens": 12488140.0, "step": 383 }, { "entropy": 1.0724992752075195, "epoch": 0.2053475935828877, "grad_norm": 0.11506830900907516, "learning_rate": 0.00018601791185127766, "loss": 1.0648, "mean_token_accuracy": 0.7341200113296509, "num_tokens": 12520340.0, "step": 384 }, { "entropy": 1.0266041606664658, "epoch": 0.20588235294117646, "grad_norm": 0.12300301343202591, "learning_rate": 0.00018593431139915552, "loss": 1.0227, "mean_token_accuracy": 0.7410801649093628, "num_tokens": 12553108.0, "step": 385 }, { "entropy": 1.0123431384563446, "epoch": 0.20641711229946524, "grad_norm": 0.1202920526266098, "learning_rate": 0.0001858504829432322, "loss": 0.9988, "mean_token_accuracy": 0.7493287026882172, "num_tokens": 12585557.0, "step": 386 }, { "entropy": 1.0406652837991714, "epoch": 0.206951871657754, "grad_norm": 0.11811408400535583, "learning_rate": 0.00018576642673521482, "loss": 1.0461, "mean_token_accuracy": 0.736681342124939, "num_tokens": 12618325.0, "step": 387 }, { "entropy": 1.0288068503141403, "epoch": 0.20748663101604278, "grad_norm": 0.11991453170776367, "learning_rate": 0.00018568214302749428, "loss": 1.0299, "mean_token_accuracy": 0.7442842423915863, "num_tokens": 12650986.0, "step": 388 }, { "entropy": 1.0087891221046448, "epoch": 0.20802139037433154, "grad_norm": 0.11459293961524963, "learning_rate": 0.00018559763207314468, "loss": 1.0041, "mean_token_accuracy": 0.7455913424491882, "num_tokens": 12683661.0, "step": 389 }, { "entropy": 0.9773480743169785, "epoch": 0.20855614973262032, "grad_norm": 0.11871340870857239, "learning_rate": 0.00018551289412592248, "loss": 0.9853, "mean_token_accuracy": 0.7529376000165939, "num_tokens": 12716352.0, "step": 390 }, { "entropy": 0.9953966587781906, "epoch": 0.20909090909090908, "grad_norm": 0.1180933490395546, "learning_rate": 0.00018542792944026566, "loss": 1.0069, "mean_token_accuracy": 0.7465786933898926, "num_tokens": 12749120.0, "step": 391 }, { "entropy": 1.0300866663455963, "epoch": 0.20962566844919786, "grad_norm": 0.12579035758972168, "learning_rate": 0.00018534273827129305, "loss": 1.0327, "mean_token_accuracy": 0.740877166390419, "num_tokens": 12781794.0, "step": 392 }, { "entropy": 0.9939381182193756, "epoch": 0.21016042780748664, "grad_norm": 0.11927133053541183, "learning_rate": 0.00018525732087480346, "loss": 0.9842, "mean_token_accuracy": 0.7507331371307373, "num_tokens": 12814562.0, "step": 393 }, { "entropy": 1.0085733830928802, "epoch": 0.2106951871657754, "grad_norm": 0.12323582172393799, "learning_rate": 0.00018517167750727506, "loss": 1.0029, "mean_token_accuracy": 0.7487475574016571, "num_tokens": 12847330.0, "step": 394 }, { "entropy": 1.0676177442073822, "epoch": 0.21122994652406418, "grad_norm": 0.12364771217107773, "learning_rate": 0.0001850858084258645, "loss": 1.0592, "mean_token_accuracy": 0.7352042198181152, "num_tokens": 12879999.0, "step": 395 }, { "entropy": 1.002898469567299, "epoch": 0.21176470588235294, "grad_norm": 0.12208950519561768, "learning_rate": 0.0001849997138884062, "loss": 1.0057, "mean_token_accuracy": 0.7455271631479263, "num_tokens": 12912638.0, "step": 396 }, { "entropy": 1.0296027660369873, "epoch": 0.21229946524064172, "grad_norm": 0.12361408025026321, "learning_rate": 0.00018491339415341147, "loss": 1.0372, "mean_token_accuracy": 0.7383919954299927, "num_tokens": 12945406.0, "step": 397 }, { "entropy": 1.0023837834596634, "epoch": 0.21283422459893048, "grad_norm": 0.11596493422985077, "learning_rate": 0.00018482684948006788, "loss": 0.9916, "mean_token_accuracy": 0.748087540268898, "num_tokens": 12978152.0, "step": 398 }, { "entropy": 1.0242990553379059, "epoch": 0.21336898395721926, "grad_norm": 0.12375675141811371, "learning_rate": 0.00018474008012823835, "loss": 1.0141, "mean_token_accuracy": 0.7455989420413971, "num_tokens": 13010174.0, "step": 399 }, { "entropy": 1.0101054310798645, "epoch": 0.21390374331550802, "grad_norm": 0.13966000080108643, "learning_rate": 0.0001846530863584605, "loss": 1.0308, "mean_token_accuracy": 0.7396749705076218, "num_tokens": 13042942.0, "step": 400 }, { "entropy": 1.0114232301712036, "epoch": 0.2144385026737968, "grad_norm": 0.1396958827972412, "learning_rate": 0.0001845658684319458, "loss": 0.9996, "mean_token_accuracy": 0.7465191334486008, "num_tokens": 13075481.0, "step": 401 }, { "entropy": 0.9742856174707413, "epoch": 0.21497326203208555, "grad_norm": 0.14045993983745575, "learning_rate": 0.00018447842661057867, "loss": 0.9846, "mean_token_accuracy": 0.750946968793869, "num_tokens": 13108249.0, "step": 402 }, { "entropy": 1.0806566923856735, "epoch": 0.21550802139037434, "grad_norm": 0.13654112815856934, "learning_rate": 0.0001843907611569159, "loss": 1.0752, "mean_token_accuracy": 0.7342375367879868, "num_tokens": 13141017.0, "step": 403 }, { "entropy": 0.9732632040977478, "epoch": 0.2160427807486631, "grad_norm": 0.12816397845745087, "learning_rate": 0.0001843028723341858, "loss": 0.9799, "mean_token_accuracy": 0.7495112419128418, "num_tokens": 13173785.0, "step": 404 }, { "entropy": 1.0426502525806427, "epoch": 0.21657754010695188, "grad_norm": 0.1289195716381073, "learning_rate": 0.00018421476040628723, "loss": 1.0596, "mean_token_accuracy": 0.7384530901908875, "num_tokens": 13206553.0, "step": 405 }, { "entropy": 1.0064526945352554, "epoch": 0.21711229946524063, "grad_norm": 0.1307249516248703, "learning_rate": 0.00018412642563778918, "loss": 0.9978, "mean_token_accuracy": 0.7462714910507202, "num_tokens": 13239020.0, "step": 406 }, { "entropy": 1.0445029735565186, "epoch": 0.21764705882352942, "grad_norm": 0.135030135512352, "learning_rate": 0.0001840378682939295, "loss": 1.0335, "mean_token_accuracy": 0.7428824603557587, "num_tokens": 13271788.0, "step": 407 }, { "entropy": 1.0235690474510193, "epoch": 0.21818181818181817, "grad_norm": 0.13958315551280975, "learning_rate": 0.00018394908864061458, "loss": 1.0282, "mean_token_accuracy": 0.7400415539741516, "num_tokens": 13304556.0, "step": 408 }, { "entropy": 1.0385311245918274, "epoch": 0.21871657754010695, "grad_norm": 0.14284038543701172, "learning_rate": 0.00018386008694441815, "loss": 1.0238, "mean_token_accuracy": 0.7407746911048889, "num_tokens": 13337324.0, "step": 409 }, { "entropy": 1.0151693224906921, "epoch": 0.2192513368983957, "grad_norm": 0.1381206065416336, "learning_rate": 0.0001837708634725808, "loss": 1.0196, "mean_token_accuracy": 0.7443615198135376, "num_tokens": 13370035.0, "step": 410 }, { "entropy": 1.029556393623352, "epoch": 0.2197860962566845, "grad_norm": 0.12878087162971497, "learning_rate": 0.0001836814184930089, "loss": 1.0254, "mean_token_accuracy": 0.74590665102005, "num_tokens": 13402803.0, "step": 411 }, { "entropy": 1.0323573648929596, "epoch": 0.22032085561497325, "grad_norm": 0.12288200855255127, "learning_rate": 0.00018359175227427405, "loss": 1.0255, "mean_token_accuracy": 0.7426676005125046, "num_tokens": 13435252.0, "step": 412 }, { "entropy": 1.0320729613304138, "epoch": 0.22085561497326203, "grad_norm": 0.1420479416847229, "learning_rate": 0.00018350186508561206, "loss": 1.0396, "mean_token_accuracy": 0.7392666786909103, "num_tokens": 13467366.0, "step": 413 }, { "entropy": 1.0164061188697815, "epoch": 0.22139037433155082, "grad_norm": 0.13489247858524323, "learning_rate": 0.0001834117571969223, "loss": 1.0241, "mean_token_accuracy": 0.7414467334747314, "num_tokens": 13500134.0, "step": 414 }, { "entropy": 0.9836405515670776, "epoch": 0.22192513368983957, "grad_norm": 0.12505358457565308, "learning_rate": 0.0001833214288787668, "loss": 0.9955, "mean_token_accuracy": 0.7502505928277969, "num_tokens": 13532484.0, "step": 415 }, { "entropy": 1.0373257994651794, "epoch": 0.22245989304812835, "grad_norm": 0.13148537278175354, "learning_rate": 0.0001832308804023694, "loss": 1.0423, "mean_token_accuracy": 0.74172542989254, "num_tokens": 13565187.0, "step": 416 }, { "entropy": 1.025197684764862, "epoch": 0.2229946524064171, "grad_norm": 0.1361037790775299, "learning_rate": 0.00018314011203961515, "loss": 1.0128, "mean_token_accuracy": 0.7460659295320511, "num_tokens": 13597697.0, "step": 417 }, { "entropy": 1.0798514187335968, "epoch": 0.2235294117647059, "grad_norm": 0.12265218794345856, "learning_rate": 0.00018304912406304916, "loss": 1.0693, "mean_token_accuracy": 0.7381899803876877, "num_tokens": 13630215.0, "step": 418 }, { "entropy": 1.101951003074646, "epoch": 0.22406417112299465, "grad_norm": 0.12851333618164062, "learning_rate": 0.00018295791674587605, "loss": 1.1005, "mean_token_accuracy": 0.7265847474336624, "num_tokens": 13662970.0, "step": 419 }, { "entropy": 1.0959776639938354, "epoch": 0.22459893048128343, "grad_norm": 0.14182980358600616, "learning_rate": 0.0001828664903619591, "loss": 1.0953, "mean_token_accuracy": 0.7329133898019791, "num_tokens": 13695397.0, "step": 420 }, { "entropy": 1.0631342232227325, "epoch": 0.2251336898395722, "grad_norm": 0.12244658917188644, "learning_rate": 0.00018277484518581928, "loss": 1.0564, "mean_token_accuracy": 0.7375366538763046, "num_tokens": 13728165.0, "step": 421 }, { "entropy": 1.0424781888723373, "epoch": 0.22566844919786097, "grad_norm": 0.1308673918247223, "learning_rate": 0.00018268298149263455, "loss": 1.0384, "mean_token_accuracy": 0.7420882135629654, "num_tokens": 13760933.0, "step": 422 }, { "entropy": 1.0469609051942825, "epoch": 0.22620320855614973, "grad_norm": 0.14152593910694122, "learning_rate": 0.00018259089955823899, "loss": 1.0395, "mean_token_accuracy": 0.7439210712909698, "num_tokens": 13793701.0, "step": 423 }, { "entropy": 1.0544581413269043, "epoch": 0.2267379679144385, "grad_norm": 0.12065031379461288, "learning_rate": 0.00018249859965912196, "loss": 1.0531, "mean_token_accuracy": 0.7355510890483856, "num_tokens": 13826469.0, "step": 424 }, { "entropy": 1.080516904592514, "epoch": 0.22727272727272727, "grad_norm": 0.1398177295923233, "learning_rate": 0.00018240608207242736, "loss": 1.0895, "mean_token_accuracy": 0.7328049540519714, "num_tokens": 13859107.0, "step": 425 }, { "entropy": 1.0148239135742188, "epoch": 0.22780748663101605, "grad_norm": 0.1272018402814865, "learning_rate": 0.00018231334707595267, "loss": 1.0362, "mean_token_accuracy": 0.7434576749801636, "num_tokens": 13891780.0, "step": 426 }, { "entropy": 1.0019170939922333, "epoch": 0.2283422459893048, "grad_norm": 0.12834444642066956, "learning_rate": 0.00018222039494814813, "loss": 1.0023, "mean_token_accuracy": 0.7483878880739212, "num_tokens": 13924267.0, "step": 427 }, { "entropy": 1.0581309497356415, "epoch": 0.2288770053475936, "grad_norm": 0.12580765783786774, "learning_rate": 0.0001821272259681161, "loss": 1.0591, "mean_token_accuracy": 0.7372151464223862, "num_tokens": 13956221.0, "step": 428 }, { "entropy": 1.0421131998300552, "epoch": 0.22941176470588234, "grad_norm": 0.12588627636432648, "learning_rate": 0.00018203384041560994, "loss": 1.042, "mean_token_accuracy": 0.7395138293504715, "num_tokens": 13988988.0, "step": 429 }, { "entropy": 1.053133800625801, "epoch": 0.22994652406417113, "grad_norm": 0.13137686252593994, "learning_rate": 0.00018194023857103333, "loss": 1.0433, "mean_token_accuracy": 0.7409885227680206, "num_tokens": 14021756.0, "step": 430 }, { "entropy": 1.0503493249416351, "epoch": 0.23048128342245988, "grad_norm": 0.11900833249092102, "learning_rate": 0.0001818464207154394, "loss": 1.041, "mean_token_accuracy": 0.7390946745872498, "num_tokens": 14054281.0, "step": 431 }, { "entropy": 1.018021285533905, "epoch": 0.23101604278074866, "grad_norm": 0.1321375072002411, "learning_rate": 0.00018175238713052992, "loss": 1.016, "mean_token_accuracy": 0.7537267804145813, "num_tokens": 14087049.0, "step": 432 }, { "entropy": 1.079847276210785, "epoch": 0.23155080213903742, "grad_norm": 0.12485760450363159, "learning_rate": 0.00018165813809865435, "loss": 1.0716, "mean_token_accuracy": 0.7340243756771088, "num_tokens": 14119639.0, "step": 433 }, { "entropy": 1.0371785908937454, "epoch": 0.2320855614973262, "grad_norm": 0.12414313852787018, "learning_rate": 0.00018156367390280914, "loss": 1.0331, "mean_token_accuracy": 0.7424853444099426, "num_tokens": 14152407.0, "step": 434 }, { "entropy": 1.0535239577293396, "epoch": 0.232620320855615, "grad_norm": 0.13701845705509186, "learning_rate": 0.00018146899482663674, "loss": 1.0642, "mean_token_accuracy": 0.7345901429653168, "num_tokens": 14184691.0, "step": 435 }, { "entropy": 1.014063835144043, "epoch": 0.23315508021390374, "grad_norm": 0.12825563549995422, "learning_rate": 0.0001813741011544248, "loss": 1.0217, "mean_token_accuracy": 0.7455740571022034, "num_tokens": 14217452.0, "step": 436 }, { "entropy": 1.0205018222332, "epoch": 0.23368983957219253, "grad_norm": 0.13586899638175964, "learning_rate": 0.0001812789931711054, "loss": 1.0306, "mean_token_accuracy": 0.7433712184429169, "num_tokens": 14250220.0, "step": 437 }, { "entropy": 0.995821937918663, "epoch": 0.23422459893048128, "grad_norm": 0.12893128395080566, "learning_rate": 0.0001811836711622541, "loss": 0.9992, "mean_token_accuracy": 0.7476478517055511, "num_tokens": 14282988.0, "step": 438 }, { "entropy": 0.996499165892601, "epoch": 0.23475935828877006, "grad_norm": 0.12627406418323517, "learning_rate": 0.00018108813541408905, "loss": 0.9886, "mean_token_accuracy": 0.7468492090702057, "num_tokens": 14315721.0, "step": 439 }, { "entropy": 1.0268193781375885, "epoch": 0.23529411764705882, "grad_norm": 0.12785205245018005, "learning_rate": 0.00018099238621347013, "loss": 1.0178, "mean_token_accuracy": 0.7465786933898926, "num_tokens": 14348489.0, "step": 440 }, { "entropy": 1.0284498482942581, "epoch": 0.2358288770053476, "grad_norm": 0.12388309091329575, "learning_rate": 0.00018089642384789837, "loss": 1.0158, "mean_token_accuracy": 0.7436087131500244, "num_tokens": 14381033.0, "step": 441 }, { "entropy": 1.014558106660843, "epoch": 0.23636363636363636, "grad_norm": 0.13476227223873138, "learning_rate": 0.00018080024860551462, "loss": 1.0128, "mean_token_accuracy": 0.7464248687028885, "num_tokens": 14413724.0, "step": 442 }, { "entropy": 1.0255836695432663, "epoch": 0.23689839572192514, "grad_norm": 0.1396826058626175, "learning_rate": 0.00018070386077509906, "loss": 1.0065, "mean_token_accuracy": 0.7451001703739166, "num_tokens": 14446293.0, "step": 443 }, { "entropy": 1.0523072481155396, "epoch": 0.2374331550802139, "grad_norm": 0.13807937502861023, "learning_rate": 0.00018060726064607013, "loss": 1.0719, "mean_token_accuracy": 0.7356121689081192, "num_tokens": 14479061.0, "step": 444 }, { "entropy": 1.0252759754657745, "epoch": 0.23796791443850268, "grad_norm": 0.12841564416885376, "learning_rate": 0.00018051044850848376, "loss": 1.043, "mean_token_accuracy": 0.7401412576436996, "num_tokens": 14511633.0, "step": 445 }, { "entropy": 0.9956532716751099, "epoch": 0.23850267379679144, "grad_norm": 0.1594788134098053, "learning_rate": 0.00018041342465303245, "loss": 1.0014, "mean_token_accuracy": 0.7484622746706009, "num_tokens": 14544241.0, "step": 446 }, { "entropy": 1.054281085729599, "epoch": 0.23903743315508022, "grad_norm": 0.13689427077770233, "learning_rate": 0.00018031618937104443, "loss": 1.0645, "mean_token_accuracy": 0.7383919954299927, "num_tokens": 14577009.0, "step": 447 }, { "entropy": 1.0149857997894287, "epoch": 0.23957219251336898, "grad_norm": 0.12950752675533295, "learning_rate": 0.00018021874295448274, "loss": 1.0095, "mean_token_accuracy": 0.7468536198139191, "num_tokens": 14609777.0, "step": 448 }, { "entropy": 1.0120531618595123, "epoch": 0.24010695187165776, "grad_norm": 0.14452537894248962, "learning_rate": 0.00018012108569594442, "loss": 1.0125, "mean_token_accuracy": 0.7444403767585754, "num_tokens": 14642545.0, "step": 449 }, { "entropy": 1.069253385066986, "epoch": 0.24064171122994651, "grad_norm": 0.12331288307905197, "learning_rate": 0.00018002321788865956, "loss": 1.0612, "mean_token_accuracy": 0.7359224706888199, "num_tokens": 14674929.0, "step": 450 }, { "entropy": 1.0392256081104279, "epoch": 0.2411764705882353, "grad_norm": 0.1363597810268402, "learning_rate": 0.0001799251398264905, "loss": 1.0422, "mean_token_accuracy": 0.7407746911048889, "num_tokens": 14707697.0, "step": 451 }, { "entropy": 1.0564654767513275, "epoch": 0.24171122994652405, "grad_norm": 0.12315265089273453, "learning_rate": 0.00017982685180393084, "loss": 1.0578, "mean_token_accuracy": 0.7412023544311523, "num_tokens": 14740465.0, "step": 452 }, { "entropy": 0.997405618429184, "epoch": 0.24224598930481284, "grad_norm": 0.1231595054268837, "learning_rate": 0.00017972835411610467, "loss": 0.9876, "mean_token_accuracy": 0.7521994113922119, "num_tokens": 14773233.0, "step": 453 }, { "entropy": 1.0320776253938675, "epoch": 0.2427807486631016, "grad_norm": 0.13027168810367584, "learning_rate": 0.00017962964705876562, "loss": 1.0324, "mean_token_accuracy": 0.7407857179641724, "num_tokens": 14805547.0, "step": 454 }, { "entropy": 1.0264951586723328, "epoch": 0.24331550802139038, "grad_norm": 0.123665951192379, "learning_rate": 0.0001795307309282959, "loss": 1.0368, "mean_token_accuracy": 0.740406945347786, "num_tokens": 14838168.0, "step": 455 }, { "entropy": 1.039396345615387, "epoch": 0.24385026737967913, "grad_norm": 0.1296997368335724, "learning_rate": 0.0001794316060217056, "loss": 1.0429, "mean_token_accuracy": 0.7405395954847336, "num_tokens": 14870827.0, "step": 456 }, { "entropy": 0.9936301857233047, "epoch": 0.24438502673796791, "grad_norm": 0.12982916831970215, "learning_rate": 0.00017933227263663168, "loss": 0.9777, "mean_token_accuracy": 0.7492057681083679, "num_tokens": 14903595.0, "step": 457 }, { "entropy": 1.0279450565576553, "epoch": 0.2449197860962567, "grad_norm": 0.13376674056053162, "learning_rate": 0.00017923273107133702, "loss": 1.0339, "mean_token_accuracy": 0.7414161860942841, "num_tokens": 14936363.0, "step": 458 }, { "entropy": 1.0354578495025635, "epoch": 0.24545454545454545, "grad_norm": 0.12655210494995117, "learning_rate": 0.0001791329816247097, "loss": 1.0215, "mean_token_accuracy": 0.7418133020401001, "num_tokens": 14969131.0, "step": 459 }, { "entropy": 1.0363547652959824, "epoch": 0.24598930481283424, "grad_norm": 0.12433706969022751, "learning_rate": 0.00017903302459626181, "loss": 1.0429, "mean_token_accuracy": 0.7416990548372269, "num_tokens": 15001726.0, "step": 460 }, { "entropy": 1.0166066586971283, "epoch": 0.246524064171123, "grad_norm": 0.13178230822086334, "learning_rate": 0.00017893286028612893, "loss": 1.0154, "mean_token_accuracy": 0.7441807091236115, "num_tokens": 15034238.0, "step": 461 }, { "entropy": 1.0380278527736664, "epoch": 0.24705882352941178, "grad_norm": 0.12242249399423599, "learning_rate": 0.00017883248899506894, "loss": 1.0187, "mean_token_accuracy": 0.7434017658233643, "num_tokens": 15067006.0, "step": 462 }, { "entropy": 1.0094977915287018, "epoch": 0.24759358288770053, "grad_norm": 0.1288090944290161, "learning_rate": 0.00017873191102446123, "loss": 1.0005, "mean_token_accuracy": 0.7474645674228668, "num_tokens": 15099774.0, "step": 463 }, { "entropy": 1.0057529509067535, "epoch": 0.24812834224598931, "grad_norm": 0.127349853515625, "learning_rate": 0.0001786311266763057, "loss": 1.017, "mean_token_accuracy": 0.7436461448669434, "num_tokens": 15132542.0, "step": 464 }, { "entropy": 1.0217185318470001, "epoch": 0.24866310160427807, "grad_norm": 0.12888741493225098, "learning_rate": 0.0001785301362532221, "loss": 1.0314, "mean_token_accuracy": 0.7411412596702576, "num_tokens": 15165310.0, "step": 465 }, { "entropy": 1.0755583345890045, "epoch": 0.24919786096256685, "grad_norm": 0.12444429099559784, "learning_rate": 0.00017842894005844876, "loss": 1.0886, "mean_token_accuracy": 0.7273490577936172, "num_tokens": 15197781.0, "step": 466 }, { "entropy": 0.9956681579351425, "epoch": 0.2497326203208556, "grad_norm": 0.13689233362674713, "learning_rate": 0.00017832753839584206, "loss": 1.0204, "mean_token_accuracy": 0.7466397881507874, "num_tokens": 15230549.0, "step": 467 }, { "entropy": 1.092822641134262, "epoch": 0.25026737967914436, "grad_norm": 0.128508061170578, "learning_rate": 0.0001782259315698751, "loss": 1.0894, "mean_token_accuracy": 0.7267534136772156, "num_tokens": 15263317.0, "step": 468 }, { "entropy": 1.0412489771842957, "epoch": 0.25080213903743315, "grad_norm": 0.12759077548980713, "learning_rate": 0.0001781241198856372, "loss": 1.0309, "mean_token_accuracy": 0.7388196587562561, "num_tokens": 15296085.0, "step": 469 }, { "entropy": 1.0570957958698273, "epoch": 0.25133689839572193, "grad_norm": 0.13412855565547943, "learning_rate": 0.00017802210364883272, "loss": 1.0521, "mean_token_accuracy": 0.7335044145584106, "num_tokens": 15328853.0, "step": 470 }, { "entropy": 1.072775959968567, "epoch": 0.2518716577540107, "grad_norm": 0.12720152735710144, "learning_rate": 0.00017791988316578024, "loss": 1.0691, "mean_token_accuracy": 0.7365253865718842, "num_tokens": 15360830.0, "step": 471 }, { "entropy": 1.0713469088077545, "epoch": 0.25240641711229944, "grad_norm": 0.16397106647491455, "learning_rate": 0.00017781745874341162, "loss": 1.0732, "mean_token_accuracy": 0.7350176274776459, "num_tokens": 15393480.0, "step": 472 }, { "entropy": 1.0052785575389862, "epoch": 0.2529411764705882, "grad_norm": 0.14997598528862, "learning_rate": 0.00017771483068927105, "loss": 1.009, "mean_token_accuracy": 0.7479838728904724, "num_tokens": 15426248.0, "step": 473 }, { "entropy": 1.0340908020734787, "epoch": 0.253475935828877, "grad_norm": 0.13000135123729706, "learning_rate": 0.00017761199931151414, "loss": 1.0149, "mean_token_accuracy": 0.7482282519340515, "num_tokens": 15459016.0, "step": 474 }, { "entropy": 1.0392605364322662, "epoch": 0.2540106951871658, "grad_norm": 0.12135802954435349, "learning_rate": 0.00017750896491890711, "loss": 1.0381, "mean_token_accuracy": 0.7424401193857193, "num_tokens": 15491690.0, "step": 475 }, { "entropy": 0.9842015951871872, "epoch": 0.2545454545454545, "grad_norm": 0.1403506100177765, "learning_rate": 0.00017740572782082566, "loss": 0.984, "mean_token_accuracy": 0.7504887580871582, "num_tokens": 15524458.0, "step": 476 }, { "entropy": 1.0196618139743805, "epoch": 0.2550802139037433, "grad_norm": 0.13045115768909454, "learning_rate": 0.00017730228832725414, "loss": 1.0212, "mean_token_accuracy": 0.7442428767681122, "num_tokens": 15557040.0, "step": 477 }, { "entropy": 1.0410612225532532, "epoch": 0.2556149732620321, "grad_norm": 0.12354070693254471, "learning_rate": 0.00017719864674878473, "loss": 1.0511, "mean_token_accuracy": 0.7382087111473083, "num_tokens": 15589808.0, "step": 478 }, { "entropy": 1.0096575170755386, "epoch": 0.25614973262032087, "grad_norm": 0.12986738979816437, "learning_rate": 0.00017709480339661625, "loss": 1.0118, "mean_token_accuracy": 0.7461607456207275, "num_tokens": 15622509.0, "step": 479 }, { "entropy": 1.0030359774827957, "epoch": 0.25668449197860965, "grad_norm": 0.13224437832832336, "learning_rate": 0.00017699075858255351, "loss": 1.0026, "mean_token_accuracy": 0.7482029497623444, "num_tokens": 15654978.0, "step": 480 }, { "entropy": 1.0419865548610687, "epoch": 0.2572192513368984, "grad_norm": 0.12151559442281723, "learning_rate": 0.00017688651261900615, "loss": 1.0244, "mean_token_accuracy": 0.7438905239105225, "num_tokens": 15687746.0, "step": 481 }, { "entropy": 0.993943452835083, "epoch": 0.25775401069518716, "grad_norm": 0.12632091343402863, "learning_rate": 0.0001767820658189878, "loss": 0.988, "mean_token_accuracy": 0.7510421872138977, "num_tokens": 15720109.0, "step": 482 }, { "entropy": 1.048466607928276, "epoch": 0.25828877005347595, "grad_norm": 0.12814119458198547, "learning_rate": 0.00017667741849611518, "loss": 1.0317, "mean_token_accuracy": 0.742913007736206, "num_tokens": 15752877.0, "step": 483 }, { "entropy": 0.9965784847736359, "epoch": 0.25882352941176473, "grad_norm": 0.1250571310520172, "learning_rate": 0.00017657257096460709, "loss": 0.9938, "mean_token_accuracy": 0.7524084597826004, "num_tokens": 15785581.0, "step": 484 }, { "entropy": 1.0023357719182968, "epoch": 0.25935828877005346, "grad_norm": 0.14221987128257751, "learning_rate": 0.00017646752353928343, "loss": 0.9945, "mean_token_accuracy": 0.7445931136608124, "num_tokens": 15818349.0, "step": 485 }, { "entropy": 1.0235685110092163, "epoch": 0.25989304812834224, "grad_norm": 0.12448383867740631, "learning_rate": 0.00017636227653556444, "loss": 1.0278, "mean_token_accuracy": 0.7433406561613083, "num_tokens": 15851117.0, "step": 486 }, { "entropy": 1.0168971568346024, "epoch": 0.260427807486631, "grad_norm": 0.14768727123737335, "learning_rate": 0.00017625683026946943, "loss": 1.0336, "mean_token_accuracy": 0.7419965863227844, "num_tokens": 15883885.0, "step": 487 }, { "entropy": 1.0200735032558441, "epoch": 0.2609625668449198, "grad_norm": 0.1485704928636551, "learning_rate": 0.0001761511850576162, "loss": 1.0421, "mean_token_accuracy": 0.7370060086250305, "num_tokens": 15916589.0, "step": 488 }, { "entropy": 1.0495162308216095, "epoch": 0.26149732620320854, "grad_norm": 0.12823961675167084, "learning_rate": 0.00017604534121721982, "loss": 1.0479, "mean_token_accuracy": 0.735199049115181, "num_tokens": 15949249.0, "step": 489 }, { "entropy": 1.003669187426567, "epoch": 0.2620320855614973, "grad_norm": 0.1225598081946373, "learning_rate": 0.00017593929906609183, "loss": 0.9972, "mean_token_accuracy": 0.751557931303978, "num_tokens": 15982017.0, "step": 490 }, { "entropy": 1.0210746675729752, "epoch": 0.2625668449197861, "grad_norm": 0.14348438382148743, "learning_rate": 0.00017583305892263912, "loss": 1.0116, "mean_token_accuracy": 0.7477700412273407, "num_tokens": 16014785.0, "step": 491 }, { "entropy": 1.0653307139873505, "epoch": 0.2631016042780749, "grad_norm": 0.1344357132911682, "learning_rate": 0.0001757266211058633, "loss": 1.0628, "mean_token_accuracy": 0.7340181171894073, "num_tokens": 16047491.0, "step": 492 }, { "entropy": 1.0467574447393417, "epoch": 0.2636363636363636, "grad_norm": 0.12318356335163116, "learning_rate": 0.0001756199859353592, "loss": 1.0468, "mean_token_accuracy": 0.7361839115619659, "num_tokens": 16079723.0, "step": 493 }, { "entropy": 1.0086955726146698, "epoch": 0.2641711229946524, "grad_norm": 0.1444452404975891, "learning_rate": 0.0001755131537313145, "loss": 1.0057, "mean_token_accuracy": 0.7454583793878555, "num_tokens": 16112352.0, "step": 494 }, { "entropy": 1.0184801816940308, "epoch": 0.2647058823529412, "grad_norm": 0.15575626492500305, "learning_rate": 0.0001754061248145084, "loss": 1.0077, "mean_token_accuracy": 0.7482426911592484, "num_tokens": 16145107.0, "step": 495 }, { "entropy": 0.9972895234823227, "epoch": 0.26524064171122996, "grad_norm": 0.14014114439487457, "learning_rate": 0.00017529889950631083, "loss": 1.0077, "mean_token_accuracy": 0.7456616163253784, "num_tokens": 16177485.0, "step": 496 }, { "entropy": 0.9907697141170502, "epoch": 0.2657754010695187, "grad_norm": 0.12591008841991425, "learning_rate": 0.00017519147812868128, "loss": 1.002, "mean_token_accuracy": 0.7483922243118286, "num_tokens": 16209917.0, "step": 497 }, { "entropy": 1.0470885783433914, "epoch": 0.2663101604278075, "grad_norm": 0.1289779245853424, "learning_rate": 0.00017508386100416806, "loss": 1.0299, "mean_token_accuracy": 0.7467584758996964, "num_tokens": 16242673.0, "step": 498 }, { "entropy": 1.0440703928470612, "epoch": 0.26684491978609626, "grad_norm": 0.1313711255788803, "learning_rate": 0.0001749760484559072, "loss": 1.036, "mean_token_accuracy": 0.7399499118328094, "num_tokens": 16275441.0, "step": 499 }, { "entropy": 1.0163945257663727, "epoch": 0.26737967914438504, "grad_norm": 0.1350429356098175, "learning_rate": 0.0001748680408076216, "loss": 1.0155, "mean_token_accuracy": 0.7431573867797852, "num_tokens": 16308209.0, "step": 500 }, { "entropy": 1.0228415131568909, "epoch": 0.2679144385026738, "grad_norm": 0.1291084736585617, "learning_rate": 0.00017475983838361985, "loss": 1.0213, "mean_token_accuracy": 0.7467547506093979, "num_tokens": 16340628.0, "step": 501 }, { "entropy": 1.0208053439855576, "epoch": 0.26844919786096255, "grad_norm": 0.13432392477989197, "learning_rate": 0.00017465144150879548, "loss": 1.0301, "mean_token_accuracy": 0.7436855286359787, "num_tokens": 16373004.0, "step": 502 }, { "entropy": 1.0301097184419632, "epoch": 0.26898395721925134, "grad_norm": 0.12891647219657898, "learning_rate": 0.00017454285050862587, "loss": 1.0238, "mean_token_accuracy": 0.7430192977190018, "num_tokens": 16405632.0, "step": 503 }, { "entropy": 1.0221526473760605, "epoch": 0.2695187165775401, "grad_norm": 0.12521539628505707, "learning_rate": 0.00017443406570917124, "loss": 1.0217, "mean_token_accuracy": 0.7419375777244568, "num_tokens": 16438246.0, "step": 504 }, { "entropy": 1.0305202156305313, "epoch": 0.2700534759358289, "grad_norm": 0.13040553033351898, "learning_rate": 0.00017432508743707374, "loss": 1.0377, "mean_token_accuracy": 0.7410676926374435, "num_tokens": 16470508.0, "step": 505 }, { "entropy": 0.9664936810731888, "epoch": 0.27058823529411763, "grad_norm": 0.14031217992305756, "learning_rate": 0.0001742159160195565, "loss": 0.9722, "mean_token_accuracy": 0.7550433725118637, "num_tokens": 16503037.0, "step": 506 }, { "entropy": 0.9803265780210495, "epoch": 0.2711229946524064, "grad_norm": 0.1306711584329605, "learning_rate": 0.00017410655178442259, "loss": 0.9881, "mean_token_accuracy": 0.747403472661972, "num_tokens": 16535805.0, "step": 507 }, { "entropy": 0.9995748400688171, "epoch": 0.2716577540106952, "grad_norm": 0.13603125512599945, "learning_rate": 0.00017399699506005393, "loss": 1.0104, "mean_token_accuracy": 0.7487271130084991, "num_tokens": 16567934.0, "step": 508 }, { "entropy": 1.0516096651554108, "epoch": 0.272192513368984, "grad_norm": 0.13807128369808197, "learning_rate": 0.00017388724617541058, "loss": 1.0551, "mean_token_accuracy": 0.7384747862815857, "num_tokens": 16600228.0, "step": 509 }, { "entropy": 1.0359112918376923, "epoch": 0.2727272727272727, "grad_norm": 0.1329445093870163, "learning_rate": 0.00017377730546002944, "loss": 1.0464, "mean_token_accuracy": 0.7396444380283356, "num_tokens": 16632996.0, "step": 510 }, { "entropy": 1.0623237937688828, "epoch": 0.2732620320855615, "grad_norm": 0.12603521347045898, "learning_rate": 0.00017366717324402353, "loss": 1.0525, "mean_token_accuracy": 0.7370479106903076, "num_tokens": 16665764.0, "step": 511 }, { "entropy": 1.060817837715149, "epoch": 0.2737967914438503, "grad_norm": 0.13725413382053375, "learning_rate": 0.00017355684985808078, "loss": 1.0642, "mean_token_accuracy": 0.7357343584299088, "num_tokens": 16698532.0, "step": 512 }, { "entropy": 1.0357243567705154, "epoch": 0.27433155080213906, "grad_norm": 0.15183106064796448, "learning_rate": 0.00017344633563346325, "loss": 1.0332, "mean_token_accuracy": 0.7371090054512024, "num_tokens": 16731300.0, "step": 513 }, { "entropy": 0.9966153800487518, "epoch": 0.2748663101604278, "grad_norm": 0.13324125111103058, "learning_rate": 0.0001733356309020059, "loss": 0.9744, "mean_token_accuracy": 0.7511016726493835, "num_tokens": 16763882.0, "step": 514 }, { "entropy": 1.0532770156860352, "epoch": 0.27540106951871657, "grad_norm": 0.12529823184013367, "learning_rate": 0.00017322473599611579, "loss": 1.0507, "mean_token_accuracy": 0.7382698059082031, "num_tokens": 16796650.0, "step": 515 }, { "entropy": 1.0636248290538788, "epoch": 0.27593582887700535, "grad_norm": 0.13705985248088837, "learning_rate": 0.000173113651248771, "loss": 1.0659, "mean_token_accuracy": 0.738916352391243, "num_tokens": 16829259.0, "step": 516 }, { "entropy": 1.0040618181228638, "epoch": 0.27647058823529413, "grad_norm": 0.1455538123846054, "learning_rate": 0.00017300237699351958, "loss": 0.992, "mean_token_accuracy": 0.7514193654060364, "num_tokens": 16861664.0, "step": 517 }, { "entropy": 1.0494634211063385, "epoch": 0.27700534759358286, "grad_norm": 0.13522738218307495, "learning_rate": 0.00017289091356447868, "loss": 1.0542, "mean_token_accuracy": 0.7430696189403534, "num_tokens": 16894217.0, "step": 518 }, { "entropy": 1.0269292891025543, "epoch": 0.27754010695187165, "grad_norm": 0.1228233054280281, "learning_rate": 0.00017277926129633341, "loss": 1.0023, "mean_token_accuracy": 0.7457484006881714, "num_tokens": 16926573.0, "step": 519 }, { "entropy": 1.0406086146831512, "epoch": 0.27807486631016043, "grad_norm": 0.14273546636104584, "learning_rate": 0.000172667420524336, "loss": 1.0352, "mean_token_accuracy": 0.7418330907821655, "num_tokens": 16959322.0, "step": 520 }, { "entropy": 1.0543610453605652, "epoch": 0.2786096256684492, "grad_norm": 0.17359653115272522, "learning_rate": 0.00017255539158430452, "loss": 1.0485, "mean_token_accuracy": 0.7373014837503433, "num_tokens": 16991838.0, "step": 521 }, { "entropy": 1.009380429983139, "epoch": 0.279144385026738, "grad_norm": 0.12541234493255615, "learning_rate": 0.00017244317481262218, "loss": 1.007, "mean_token_accuracy": 0.7480654567480087, "num_tokens": 17024409.0, "step": 522 }, { "entropy": 1.0351165235042572, "epoch": 0.2796791443850267, "grad_norm": 0.1301652491092682, "learning_rate": 0.0001723307705462361, "loss": 1.0375, "mean_token_accuracy": 0.739235907793045, "num_tokens": 17056854.0, "step": 523 }, { "entropy": 0.9839812368154526, "epoch": 0.2802139037433155, "grad_norm": 0.13403740525245667, "learning_rate": 0.0001722181791226565, "loss": 1.0012, "mean_token_accuracy": 0.7473640590906143, "num_tokens": 17089577.0, "step": 524 }, { "entropy": 1.013493835926056, "epoch": 0.2807486631016043, "grad_norm": 0.13362367451190948, "learning_rate": 0.00017210540087995546, "loss": 1.0236, "mean_token_accuracy": 0.7451429665088654, "num_tokens": 17122345.0, "step": 525 }, { "entropy": 1.0553540736436844, "epoch": 0.2812834224598931, "grad_norm": 0.12962518632411957, "learning_rate": 0.00017199243615676597, "loss": 1.0545, "mean_token_accuracy": 0.7424853444099426, "num_tokens": 17155113.0, "step": 526 }, { "entropy": 1.0302038192749023, "epoch": 0.2818181818181818, "grad_norm": 0.13777880370616913, "learning_rate": 0.0001718792852922811, "loss": 1.0216, "mean_token_accuracy": 0.7463139593601227, "num_tokens": 17187808.0, "step": 527 }, { "entropy": 1.0302510261535645, "epoch": 0.2823529411764706, "grad_norm": 0.13522762060165405, "learning_rate": 0.00017176594862625274, "loss": 1.0233, "mean_token_accuracy": 0.7411107122898102, "num_tokens": 17220576.0, "step": 528 }, { "entropy": 1.0147441774606705, "epoch": 0.28288770053475937, "grad_norm": 0.1387215256690979, "learning_rate": 0.00017165242649899061, "loss": 1.0277, "mean_token_accuracy": 0.7405515313148499, "num_tokens": 17253033.0, "step": 529 }, { "entropy": 1.0255239754915237, "epoch": 0.28342245989304815, "grad_norm": 0.13352422416210175, "learning_rate": 0.00017153871925136154, "loss": 1.0302, "mean_token_accuracy": 0.7430898249149323, "num_tokens": 17285505.0, "step": 530 }, { "entropy": 1.0094399154186249, "epoch": 0.2839572192513369, "grad_norm": 0.12503620982170105, "learning_rate": 0.00017142482722478795, "loss": 0.9919, "mean_token_accuracy": 0.7527617067098618, "num_tokens": 17318210.0, "step": 531 }, { "entropy": 1.0132300108671188, "epoch": 0.28449197860962566, "grad_norm": 0.12262210994958878, "learning_rate": 0.00017131075076124724, "loss": 1.0109, "mean_token_accuracy": 0.7407647520303726, "num_tokens": 17350844.0, "step": 532 }, { "entropy": 0.9696976691484451, "epoch": 0.28502673796791445, "grad_norm": 0.12632109224796295, "learning_rate": 0.00017119649020327053, "loss": 0.9535, "mean_token_accuracy": 0.7574230134487152, "num_tokens": 17383612.0, "step": 533 }, { "entropy": 0.989849328994751, "epoch": 0.28556149732620323, "grad_norm": 0.12456396967172623, "learning_rate": 0.00017108204589394178, "loss": 0.9767, "mean_token_accuracy": 0.7528417408466339, "num_tokens": 17416080.0, "step": 534 }, { "entropy": 1.013772889971733, "epoch": 0.28609625668449196, "grad_norm": 0.12125220149755478, "learning_rate": 0.00017096741817689663, "loss": 1.0128, "mean_token_accuracy": 0.7480128556489944, "num_tokens": 17448632.0, "step": 535 }, { "entropy": 0.9724691212177277, "epoch": 0.28663101604278074, "grad_norm": 0.13034920394420624, "learning_rate": 0.00017085260739632148, "loss": 0.9904, "mean_token_accuracy": 0.7504276633262634, "num_tokens": 17481400.0, "step": 536 }, { "entropy": 0.9728181064128876, "epoch": 0.2871657754010695, "grad_norm": 0.12571309506893158, "learning_rate": 0.00017073761389695233, "loss": 0.9688, "mean_token_accuracy": 0.7547041773796082, "num_tokens": 17514069.0, "step": 537 }, { "entropy": 1.0064800530672073, "epoch": 0.2877005347593583, "grad_norm": 0.1352233588695526, "learning_rate": 0.00017062243802407393, "loss": 1.0153, "mean_token_accuracy": 0.7431587725877762, "num_tokens": 17546810.0, "step": 538 }, { "entropy": 0.9872759431600571, "epoch": 0.28823529411764703, "grad_norm": 0.13819102942943573, "learning_rate": 0.00017050708012351852, "loss": 1.0031, "mean_token_accuracy": 0.7485620528459549, "num_tokens": 17579265.0, "step": 539 }, { "entropy": 1.0200807750225067, "epoch": 0.2887700534759358, "grad_norm": 0.13746803998947144, "learning_rate": 0.000170391540541665, "loss": 1.0361, "mean_token_accuracy": 0.7412613034248352, "num_tokens": 17611772.0, "step": 540 }, { "entropy": 1.0252193808555603, "epoch": 0.2893048128342246, "grad_norm": 0.12763537466526031, "learning_rate": 0.00017027581962543768, "loss": 1.0328, "mean_token_accuracy": 0.742118775844574, "num_tokens": 17644540.0, "step": 541 }, { "entropy": 1.0053461492061615, "epoch": 0.2898395721925134, "grad_norm": 0.12436962127685547, "learning_rate": 0.00017015991772230545, "loss": 0.9979, "mean_token_accuracy": 0.742668628692627, "num_tokens": 17677308.0, "step": 542 }, { "entropy": 1.030677169561386, "epoch": 0.2903743315508021, "grad_norm": 0.13748657703399658, "learning_rate": 0.00017004383518028069, "loss": 1.0105, "mean_token_accuracy": 0.7427105903625488, "num_tokens": 17710034.0, "step": 543 }, { "entropy": 1.0635046660900116, "epoch": 0.2909090909090909, "grad_norm": 0.12991686165332794, "learning_rate": 0.000169927572347918, "loss": 1.0535, "mean_token_accuracy": 0.7350575774908066, "num_tokens": 17742638.0, "step": 544 }, { "entropy": 0.9904083460569382, "epoch": 0.2914438502673797, "grad_norm": 0.1312393695116043, "learning_rate": 0.00016981112957431345, "loss": 0.9696, "mean_token_accuracy": 0.7539711594581604, "num_tokens": 17775406.0, "step": 545 }, { "entropy": 1.080278903245926, "epoch": 0.29197860962566846, "grad_norm": 0.13253837823867798, "learning_rate": 0.0001696945072091034, "loss": 1.0676, "mean_token_accuracy": 0.735523447394371, "num_tokens": 17807650.0, "step": 546 }, { "entropy": 0.9899517744779587, "epoch": 0.29251336898395724, "grad_norm": 0.12947142124176025, "learning_rate": 0.00016957770560246344, "loss": 0.9899, "mean_token_accuracy": 0.75409334897995, "num_tokens": 17840418.0, "step": 547 }, { "entropy": 0.9981743395328522, "epoch": 0.293048128342246, "grad_norm": 0.1302729994058609, "learning_rate": 0.00016946072510510733, "loss": 1.0118, "mean_token_accuracy": 0.747682735323906, "num_tokens": 17873143.0, "step": 548 }, { "entropy": 0.9922465980052948, "epoch": 0.29358288770053476, "grad_norm": 0.13240595161914825, "learning_rate": 0.00016934356606828604, "loss": 1.0037, "mean_token_accuracy": 0.7481169998645782, "num_tokens": 17905764.0, "step": 549 }, { "entropy": 1.0052950233221054, "epoch": 0.29411764705882354, "grad_norm": 0.13417880237102509, "learning_rate": 0.0001692262288437866, "loss": 1.0208, "mean_token_accuracy": 0.7440185844898224, "num_tokens": 17938223.0, "step": 550 }, { "entropy": 1.037492722272873, "epoch": 0.2946524064171123, "grad_norm": 0.23050248622894287, "learning_rate": 0.00016910871378393108, "loss": 1.0561, "mean_token_accuracy": 0.7388520389795303, "num_tokens": 17970901.0, "step": 551 }, { "entropy": 1.0313400328159332, "epoch": 0.29518716577540105, "grad_norm": 0.13766978681087494, "learning_rate": 0.00016899102124157548, "loss": 1.0446, "mean_token_accuracy": 0.7397971600294113, "num_tokens": 18003669.0, "step": 552 }, { "entropy": 1.0375900715589523, "epoch": 0.29572192513368983, "grad_norm": 0.13091754913330078, "learning_rate": 0.00016887315157010878, "loss": 1.038, "mean_token_accuracy": 0.7395756989717484, "num_tokens": 18036436.0, "step": 553 }, { "entropy": 1.039006382226944, "epoch": 0.2962566844919786, "grad_norm": 0.1290474236011505, "learning_rate": 0.0001687551051234518, "loss": 1.0344, "mean_token_accuracy": 0.7386363744735718, "num_tokens": 18069204.0, "step": 554 }, { "entropy": 0.9995266646146774, "epoch": 0.2967914438502674, "grad_norm": 0.13149632513523102, "learning_rate": 0.00016863688225605614, "loss": 0.9852, "mean_token_accuracy": 0.7554068863391876, "num_tokens": 18101972.0, "step": 555 }, { "entropy": 1.0221814066171646, "epoch": 0.29732620320855613, "grad_norm": 0.15010221302509308, "learning_rate": 0.00016851848332290312, "loss": 1.0073, "mean_token_accuracy": 0.7423326075077057, "num_tokens": 18134740.0, "step": 556 }, { "entropy": 1.008228451013565, "epoch": 0.2978609625668449, "grad_norm": 0.1910933405160904, "learning_rate": 0.00016839990867950275, "loss": 1.0087, "mean_token_accuracy": 0.7485948204994202, "num_tokens": 18167508.0, "step": 557 }, { "entropy": 0.9887708127498627, "epoch": 0.2983957219251337, "grad_norm": 0.23146788775920868, "learning_rate": 0.0001682811586818926, "loss": 0.9947, "mean_token_accuracy": 0.7519334852695465, "num_tokens": 18200202.0, "step": 558 }, { "entropy": 1.0437268614768982, "epoch": 0.2989304812834225, "grad_norm": 0.222130686044693, "learning_rate": 0.00016816223368663682, "loss": 1.0384, "mean_token_accuracy": 0.7401331961154938, "num_tokens": 18232970.0, "step": 559 }, { "entropy": 1.046816885471344, "epoch": 0.2994652406417112, "grad_norm": 0.163495272397995, "learning_rate": 0.00016804313405082498, "loss": 1.0507, "mean_token_accuracy": 0.7394961565732956, "num_tokens": 18265492.0, "step": 560 }, { "entropy": 1.0264595448970795, "epoch": 0.3, "grad_norm": 0.1313883364200592, "learning_rate": 0.000167923860132071, "loss": 1.0323, "mean_token_accuracy": 0.7444403767585754, "num_tokens": 18298260.0, "step": 561 }, { "entropy": 1.0883614718914032, "epoch": 0.30053475935828877, "grad_norm": 0.16005565226078033, "learning_rate": 0.00016780441228851224, "loss": 1.0815, "mean_token_accuracy": 0.7313090562820435, "num_tokens": 18330777.0, "step": 562 }, { "entropy": 1.0424848198890686, "epoch": 0.30106951871657756, "grad_norm": 0.14987064898014069, "learning_rate": 0.00016768479087880814, "loss": 1.034, "mean_token_accuracy": 0.7385043501853943, "num_tokens": 18363464.0, "step": 563 }, { "entropy": 1.0108812600374222, "epoch": 0.3016042780748663, "grad_norm": 0.13205383718013763, "learning_rate": 0.00016756499626213934, "loss": 0.9954, "mean_token_accuracy": 0.7490835785865784, "num_tokens": 18396232.0, "step": 564 }, { "entropy": 1.0051104724407196, "epoch": 0.30213903743315507, "grad_norm": 0.1460045427083969, "learning_rate": 0.00016744502879820658, "loss": 0.9975, "mean_token_accuracy": 0.7545073330402374, "num_tokens": 18428843.0, "step": 565 }, { "entropy": 1.0304394662380219, "epoch": 0.30267379679144385, "grad_norm": 0.13122116029262543, "learning_rate": 0.00016732488884722962, "loss": 1.028, "mean_token_accuracy": 0.7366521507501602, "num_tokens": 18461495.0, "step": 566 }, { "entropy": 0.990816205739975, "epoch": 0.30320855614973263, "grad_norm": 0.13677483797073364, "learning_rate": 0.00016720457676994616, "loss": 0.9934, "mean_token_accuracy": 0.749927431344986, "num_tokens": 18494227.0, "step": 567 }, { "entropy": 1.0301152765750885, "epoch": 0.3037433155080214, "grad_norm": 0.13701874017715454, "learning_rate": 0.0001670840929276106, "loss": 1.0299, "mean_token_accuracy": 0.7401285469532013, "num_tokens": 18526964.0, "step": 568 }, { "entropy": 0.9530722349882126, "epoch": 0.30427807486631014, "grad_norm": 0.1365976631641388, "learning_rate": 0.00016696343768199328, "loss": 0.9612, "mean_token_accuracy": 0.7563553899526596, "num_tokens": 18559139.0, "step": 569 }, { "entropy": 0.97348852455616, "epoch": 0.3048128342245989, "grad_norm": 0.150132954120636, "learning_rate": 0.000166842611395379, "loss": 0.9953, "mean_token_accuracy": 0.7487475574016571, "num_tokens": 18591907.0, "step": 570 }, { "entropy": 1.022206872701645, "epoch": 0.3053475935828877, "grad_norm": 0.133019357919693, "learning_rate": 0.00016672161443056634, "loss": 1.0462, "mean_token_accuracy": 0.7444709241390228, "num_tokens": 18624675.0, "step": 571 }, { "entropy": 1.0235802233219147, "epoch": 0.3058823529411765, "grad_norm": 0.14455930888652802, "learning_rate": 0.00016660044715086615, "loss": 1.0421, "mean_token_accuracy": 0.7430094182491302, "num_tokens": 18656296.0, "step": 572 }, { "entropy": 1.021704226732254, "epoch": 0.3064171122994652, "grad_norm": 0.13559989631175995, "learning_rate": 0.00016647910992010088, "loss": 1.0196, "mean_token_accuracy": 0.7471896409988403, "num_tokens": 18689064.0, "step": 573 }, { "entropy": 1.0069758594036102, "epoch": 0.306951871657754, "grad_norm": 0.1325547844171524, "learning_rate": 0.00016635760310260318, "loss": 1.0036, "mean_token_accuracy": 0.7480007261037827, "num_tokens": 18721775.0, "step": 574 }, { "entropy": 1.0201173424720764, "epoch": 0.3074866310160428, "grad_norm": 0.14399488270282745, "learning_rate": 0.00016623592706321491, "loss": 1.0093, "mean_token_accuracy": 0.7490224838256836, "num_tokens": 18754543.0, "step": 575 }, { "entropy": 1.0243518650531769, "epoch": 0.30802139037433157, "grad_norm": 0.12975279986858368, "learning_rate": 0.00016611408216728603, "loss": 1.0159, "mean_token_accuracy": 0.7417757958173752, "num_tokens": 18787081.0, "step": 576 }, { "entropy": 0.952204629778862, "epoch": 0.3085561497326203, "grad_norm": 0.13181157410144806, "learning_rate": 0.00016599206878067355, "loss": 0.9344, "mean_token_accuracy": 0.7643770426511765, "num_tokens": 18819834.0, "step": 577 }, { "entropy": 0.9844817519187927, "epoch": 0.3090909090909091, "grad_norm": 0.1284974217414856, "learning_rate": 0.00016586988726974042, "loss": 0.9761, "mean_token_accuracy": 0.7529546320438385, "num_tokens": 18852582.0, "step": 578 }, { "entropy": 1.0278932452201843, "epoch": 0.30962566844919787, "grad_norm": 0.1554325520992279, "learning_rate": 0.00016574753800135437, "loss": 1.0216, "mean_token_accuracy": 0.7442286312580109, "num_tokens": 18885047.0, "step": 579 }, { "entropy": 1.012410581111908, "epoch": 0.31016042780748665, "grad_norm": 0.12814141809940338, "learning_rate": 0.00016562502134288685, "loss": 1.0062, "mean_token_accuracy": 0.7505649775266647, "num_tokens": 18917677.0, "step": 580 }, { "entropy": 1.0278237909078598, "epoch": 0.3106951871657754, "grad_norm": 0.12905891239643097, "learning_rate": 0.00016550233766221185, "loss": 1.0384, "mean_token_accuracy": 0.7401637434959412, "num_tokens": 18950445.0, "step": 581 }, { "entropy": 1.0323994159698486, "epoch": 0.31122994652406416, "grad_norm": 0.13156746327877045, "learning_rate": 0.00016537948732770508, "loss": 1.0529, "mean_token_accuracy": 0.7383771538734436, "num_tokens": 18983096.0, "step": 582 }, { "entropy": 1.0405890196561813, "epoch": 0.31176470588235294, "grad_norm": 0.13030070066452026, "learning_rate": 0.0001652564707082424, "loss": 1.0401, "mean_token_accuracy": 0.7420965284109116, "num_tokens": 19015394.0, "step": 583 }, { "entropy": 1.0359513461589813, "epoch": 0.3122994652406417, "grad_norm": 0.12979918718338013, "learning_rate": 0.00016513328817319915, "loss": 1.0324, "mean_token_accuracy": 0.7427784353494644, "num_tokens": 19048014.0, "step": 584 }, { "entropy": 1.0870505571365356, "epoch": 0.31283422459893045, "grad_norm": 0.13175484538078308, "learning_rate": 0.00016500994009244877, "loss": 1.0817, "mean_token_accuracy": 0.7303274869918823, "num_tokens": 19080782.0, "step": 585 }, { "entropy": 1.0574510097503662, "epoch": 0.31336898395721924, "grad_norm": 0.12824955582618713, "learning_rate": 0.00016488642683636174, "loss": 1.0582, "mean_token_accuracy": 0.7358871102333069, "num_tokens": 19113550.0, "step": 586 }, { "entropy": 1.059159204363823, "epoch": 0.313903743315508, "grad_norm": 0.13426367938518524, "learning_rate": 0.00016476274877580462, "loss": 1.0562, "mean_token_accuracy": 0.7356574982404709, "num_tokens": 19145703.0, "step": 587 }, { "entropy": 1.0259163677692413, "epoch": 0.3144385026737968, "grad_norm": 0.13646824657917023, "learning_rate": 0.00016463890628213874, "loss": 1.0073, "mean_token_accuracy": 0.74399633705616, "num_tokens": 19178333.0, "step": 588 }, { "entropy": 1.020667016506195, "epoch": 0.3149732620320856, "grad_norm": 0.12998659908771515, "learning_rate": 0.00016451489972721915, "loss": 1.007, "mean_token_accuracy": 0.7440432608127594, "num_tokens": 19211101.0, "step": 589 }, { "entropy": 1.0039927810430527, "epoch": 0.3155080213903743, "grad_norm": 0.13191555440425873, "learning_rate": 0.00016439072948339358, "loss": 1.0113, "mean_token_accuracy": 0.7398888170719147, "num_tokens": 19243869.0, "step": 590 }, { "entropy": 1.0305038839578629, "epoch": 0.3160427807486631, "grad_norm": 0.13551065325737, "learning_rate": 0.00016426639592350122, "loss": 1.0333, "mean_token_accuracy": 0.7388502061367035, "num_tokens": 19276637.0, "step": 591 }, { "entropy": 0.9789949357509613, "epoch": 0.3165775401069519, "grad_norm": 0.1322057992219925, "learning_rate": 0.00016414189942087163, "loss": 0.9724, "mean_token_accuracy": 0.7544249892234802, "num_tokens": 19308982.0, "step": 592 }, { "entropy": 1.0381294786930084, "epoch": 0.31711229946524067, "grad_norm": 0.13753201067447662, "learning_rate": 0.0001640172403493237, "loss": 1.0481, "mean_token_accuracy": 0.7402539104223251, "num_tokens": 19341114.0, "step": 593 }, { "entropy": 1.008999764919281, "epoch": 0.3176470588235294, "grad_norm": 0.148467019200325, "learning_rate": 0.0001638924190831643, "loss": 1.0246, "mean_token_accuracy": 0.7437377870082855, "num_tokens": 19373882.0, "step": 594 }, { "entropy": 0.9896555691957474, "epoch": 0.3181818181818182, "grad_norm": 0.1336461752653122, "learning_rate": 0.00016376743599718748, "loss": 0.9853, "mean_token_accuracy": 0.7501374632120132, "num_tokens": 19406386.0, "step": 595 }, { "entropy": 1.028982162475586, "epoch": 0.31871657754010696, "grad_norm": 0.13734780251979828, "learning_rate": 0.00016364229146667312, "loss": 1.0014, "mean_token_accuracy": 0.7460211962461472, "num_tokens": 19439068.0, "step": 596 }, { "entropy": 1.0380202233791351, "epoch": 0.31925133689839574, "grad_norm": 0.15408451855182648, "learning_rate": 0.00016351698586738583, "loss": 1.0402, "mean_token_accuracy": 0.7399239391088486, "num_tokens": 19471403.0, "step": 597 }, { "entropy": 0.9874198585748672, "epoch": 0.31978609625668447, "grad_norm": 0.14568470418453217, "learning_rate": 0.00016339151957557384, "loss": 0.9877, "mean_token_accuracy": 0.7511913478374481, "num_tokens": 19504171.0, "step": 598 }, { "entropy": 1.002236232161522, "epoch": 0.32032085561497325, "grad_norm": 0.13177983462810516, "learning_rate": 0.00016326589296796802, "loss": 1.0073, "mean_token_accuracy": 0.7449902296066284, "num_tokens": 19536939.0, "step": 599 }, { "entropy": 0.982133075594902, "epoch": 0.32085561497326204, "grad_norm": 0.14566829800605774, "learning_rate": 0.00016314010642178042, "loss": 0.9991, "mean_token_accuracy": 0.751496821641922, "num_tokens": 19569707.0, "step": 600 }, { "entropy": 1.0116736739873886, "epoch": 0.3213903743315508, "grad_norm": 0.15575391054153442, "learning_rate": 0.00016301416031470342, "loss": 1.0168, "mean_token_accuracy": 0.7445014715194702, "num_tokens": 19602475.0, "step": 601 }, { "entropy": 1.0237414687871933, "epoch": 0.32192513368983955, "grad_norm": 0.1417372077703476, "learning_rate": 0.00016288805502490857, "loss": 1.024, "mean_token_accuracy": 0.7416205853223801, "num_tokens": 19635118.0, "step": 602 }, { "entropy": 1.002694696187973, "epoch": 0.32245989304812833, "grad_norm": 0.13840502500534058, "learning_rate": 0.00016276179093104528, "loss": 1.0029, "mean_token_accuracy": 0.7493388056755066, "num_tokens": 19667570.0, "step": 603 }, { "entropy": 1.0062010288238525, "epoch": 0.3229946524064171, "grad_norm": 0.1401420682668686, "learning_rate": 0.00016263536841223982, "loss": 1.0024, "mean_token_accuracy": 0.7453262507915497, "num_tokens": 19700338.0, "step": 604 }, { "entropy": 1.0327490121126175, "epoch": 0.3235294117647059, "grad_norm": 0.1464383602142334, "learning_rate": 0.0001625087878480942, "loss": 1.0259, "mean_token_accuracy": 0.7398741096258163, "num_tokens": 19731972.0, "step": 605 }, { "entropy": 0.9721281826496124, "epoch": 0.3240641711229946, "grad_norm": 0.14435307681560516, "learning_rate": 0.00016238204961868493, "loss": 0.9685, "mean_token_accuracy": 0.7569648027420044, "num_tokens": 19764740.0, "step": 606 }, { "entropy": 1.028033286333084, "epoch": 0.3245989304812834, "grad_norm": 0.13649606704711914, "learning_rate": 0.00016225515410456197, "loss": 1.0172, "mean_token_accuracy": 0.7414901256561279, "num_tokens": 19797454.0, "step": 607 }, { "entropy": 1.0394579023122787, "epoch": 0.3251336898395722, "grad_norm": 0.12738116085529327, "learning_rate": 0.00016212810168674753, "loss": 1.0294, "mean_token_accuracy": 0.7402873635292053, "num_tokens": 19830177.0, "step": 608 }, { "entropy": 0.9974191337823868, "epoch": 0.325668449197861, "grad_norm": 0.13835933804512024, "learning_rate": 0.00016200089274673492, "loss": 0.984, "mean_token_accuracy": 0.748787984251976, "num_tokens": 19862661.0, "step": 609 }, { "entropy": 0.9926651865243912, "epoch": 0.32620320855614976, "grad_norm": 0.1357496827840805, "learning_rate": 0.00016187352766648753, "loss": 0.9856, "mean_token_accuracy": 0.752595990896225, "num_tokens": 19895402.0, "step": 610 }, { "entropy": 0.9659290164709091, "epoch": 0.3267379679144385, "grad_norm": 0.1416723132133484, "learning_rate": 0.00016174600682843747, "loss": 0.9723, "mean_token_accuracy": 0.7547358125448227, "num_tokens": 19927869.0, "step": 611 }, { "entropy": 1.0146066099405289, "epoch": 0.32727272727272727, "grad_norm": 0.15567292273044586, "learning_rate": 0.00016161833061548463, "loss": 1.024, "mean_token_accuracy": 0.7476064115762711, "num_tokens": 19960378.0, "step": 612 }, { "entropy": 0.9735582172870636, "epoch": 0.32780748663101605, "grad_norm": 0.16228708624839783, "learning_rate": 0.00016149049941099528, "loss": 0.9995, "mean_token_accuracy": 0.7474645674228668, "num_tokens": 19993146.0, "step": 613 }, { "entropy": 0.9890165776014328, "epoch": 0.32834224598930484, "grad_norm": 0.14233818650245667, "learning_rate": 0.00016136251359880128, "loss": 1.0019, "mean_token_accuracy": 0.7476700097322464, "num_tokens": 20025356.0, "step": 614 }, { "entropy": 1.0183548629283905, "epoch": 0.32887700534759357, "grad_norm": 0.1370004415512085, "learning_rate": 0.00016123437356319859, "loss": 1.0333, "mean_token_accuracy": 0.7452346086502075, "num_tokens": 20058124.0, "step": 615 }, { "entropy": 0.9772834926843643, "epoch": 0.32941176470588235, "grad_norm": 0.13862505555152893, "learning_rate": 0.0001611060796889463, "loss": 0.9591, "mean_token_accuracy": 0.7551965862512589, "num_tokens": 20090631.0, "step": 616 }, { "entropy": 1.0807088017463684, "epoch": 0.32994652406417113, "grad_norm": 0.13603933155536652, "learning_rate": 0.0001609776323612654, "loss": 1.0786, "mean_token_accuracy": 0.7374144792556763, "num_tokens": 20123399.0, "step": 617 }, { "entropy": 1.0449609160423279, "epoch": 0.3304812834224599, "grad_norm": 0.13282941281795502, "learning_rate": 0.0001608490319658376, "loss": 1.0362, "mean_token_accuracy": 0.7421600371599197, "num_tokens": 20155570.0, "step": 618 }, { "entropy": 1.0110503286123276, "epoch": 0.33101604278074864, "grad_norm": 0.13819116353988647, "learning_rate": 0.00016072027888880438, "loss": 0.9944, "mean_token_accuracy": 0.750076025724411, "num_tokens": 20187984.0, "step": 619 }, { "entropy": 1.037804901599884, "epoch": 0.3315508021390374, "grad_norm": 0.13222205638885498, "learning_rate": 0.00016059137351676542, "loss": 1.0153, "mean_token_accuracy": 0.7441959977149963, "num_tokens": 20220752.0, "step": 620 }, { "entropy": 1.0193375647068024, "epoch": 0.3320855614973262, "grad_norm": 0.13431870937347412, "learning_rate": 0.00016046231623677797, "loss": 1.0021, "mean_token_accuracy": 0.7499389052391052, "num_tokens": 20253520.0, "step": 621 }, { "entropy": 1.0201116502285004, "epoch": 0.332620320855615, "grad_norm": 0.13786272704601288, "learning_rate": 0.00016033310743635514, "loss": 1.0337, "mean_token_accuracy": 0.7389914989471436, "num_tokens": 20286123.0, "step": 622 }, { "entropy": 1.0212906897068024, "epoch": 0.3331550802139037, "grad_norm": 0.1344262808561325, "learning_rate": 0.0001602037475034652, "loss": 1.0276, "mean_token_accuracy": 0.7460288405418396, "num_tokens": 20318891.0, "step": 623 }, { "entropy": 0.9826373159885406, "epoch": 0.3336898395721925, "grad_norm": 0.1419702172279358, "learning_rate": 0.00016007423682653008, "loss": 0.9947, "mean_token_accuracy": 0.7482587993144989, "num_tokens": 20351659.0, "step": 624 }, { "entropy": 1.0362474918365479, "epoch": 0.3342245989304813, "grad_norm": 0.14024557173252106, "learning_rate": 0.00015994457579442443, "loss": 1.0406, "mean_token_accuracy": 0.7413856387138367, "num_tokens": 20384427.0, "step": 625 }, { "entropy": 0.9735687077045441, "epoch": 0.33475935828877007, "grad_norm": 0.13640111684799194, "learning_rate": 0.00015981476479647437, "loss": 0.9725, "mean_token_accuracy": 0.755733072757721, "num_tokens": 20417153.0, "step": 626 }, { "entropy": 1.0228614211082458, "epoch": 0.3352941176470588, "grad_norm": 0.14213134348392487, "learning_rate": 0.00015968480422245626, "loss": 1.0324, "mean_token_accuracy": 0.7432184815406799, "num_tokens": 20449921.0, "step": 627 }, { "entropy": 1.0295179784297943, "epoch": 0.3358288770053476, "grad_norm": 0.1468447744846344, "learning_rate": 0.00015955469446259557, "loss": 1.0245, "mean_token_accuracy": 0.7425564378499985, "num_tokens": 20482396.0, "step": 628 }, { "entropy": 1.0403719544410706, "epoch": 0.33636363636363636, "grad_norm": 0.12881560623645782, "learning_rate": 0.0001594244359075658, "loss": 1.0434, "mean_token_accuracy": 0.7414287030696869, "num_tokens": 20515037.0, "step": 629 }, { "entropy": 0.9976829290390015, "epoch": 0.33689839572192515, "grad_norm": 0.15784220397472382, "learning_rate": 0.00015929402894848714, "loss": 0.9934, "mean_token_accuracy": 0.7500075548887253, "num_tokens": 20547082.0, "step": 630 }, { "entropy": 1.0174478143453598, "epoch": 0.33743315508021393, "grad_norm": 0.16386115550994873, "learning_rate": 0.00015916347397692548, "loss": 1.0229, "mean_token_accuracy": 0.7399440705776215, "num_tokens": 20579826.0, "step": 631 }, { "entropy": 0.9906077682971954, "epoch": 0.33796791443850266, "grad_norm": 0.13278624415397644, "learning_rate": 0.00015903277138489105, "loss": 0.9912, "mean_token_accuracy": 0.7534213066101074, "num_tokens": 20612594.0, "step": 632 }, { "entropy": 1.0336619019508362, "epoch": 0.33850267379679144, "grad_norm": 0.16966547071933746, "learning_rate": 0.00015890192156483738, "loss": 1.0395, "mean_token_accuracy": 0.7384225428104401, "num_tokens": 20645362.0, "step": 633 }, { "entropy": 1.0511770248413086, "epoch": 0.3390374331550802, "grad_norm": 0.16399900615215302, "learning_rate": 0.00015877092490966006, "loss": 1.0482, "mean_token_accuracy": 0.7364983707666397, "num_tokens": 20677920.0, "step": 634 }, { "entropy": 1.0212816298007965, "epoch": 0.339572192513369, "grad_norm": 0.13357484340667725, "learning_rate": 0.00015863978181269558, "loss": 1.009, "mean_token_accuracy": 0.7460338771343231, "num_tokens": 20710466.0, "step": 635 }, { "entropy": 1.0000340044498444, "epoch": 0.34010695187165774, "grad_norm": 0.15471495687961578, "learning_rate": 0.00015850849266772017, "loss": 0.9948, "mean_token_accuracy": 0.7509164214134216, "num_tokens": 20743234.0, "step": 636 }, { "entropy": 1.0232605040073395, "epoch": 0.3406417112299465, "grad_norm": 0.1373874694108963, "learning_rate": 0.0001583770578689485, "loss": 1.0167, "mean_token_accuracy": 0.7430351972579956, "num_tokens": 20776002.0, "step": 637 }, { "entropy": 1.0646322965621948, "epoch": 0.3411764705882353, "grad_norm": 0.14697957038879395, "learning_rate": 0.00015824547781103268, "loss": 1.072, "mean_token_accuracy": 0.7312438935041428, "num_tokens": 20808770.0, "step": 638 }, { "entropy": 1.0304863154888153, "epoch": 0.3417112299465241, "grad_norm": 0.16963070631027222, "learning_rate": 0.00015811375288906097, "loss": 1.036, "mean_token_accuracy": 0.7408357858657837, "num_tokens": 20841538.0, "step": 639 }, { "entropy": 0.9575765281915665, "epoch": 0.3422459893048128, "grad_norm": 0.14465726912021637, "learning_rate": 0.00015798188349855659, "loss": 0.954, "mean_token_accuracy": 0.7573956996202469, "num_tokens": 20874027.0, "step": 640 }, { "entropy": 0.9817063212394714, "epoch": 0.3427807486631016, "grad_norm": 0.13434670865535736, "learning_rate": 0.0001578498700354765, "loss": 0.9651, "mean_token_accuracy": 0.7546056061983109, "num_tokens": 20906619.0, "step": 641 }, { "entropy": 1.0032513439655304, "epoch": 0.3433155080213904, "grad_norm": 0.15078270435333252, "learning_rate": 0.00015771771289621028, "loss": 0.997, "mean_token_accuracy": 0.7474170327186584, "num_tokens": 20938677.0, "step": 642 }, { "entropy": 1.0587093532085419, "epoch": 0.34385026737967916, "grad_norm": 0.1376420557498932, "learning_rate": 0.00015758541247757902, "loss": 1.0776, "mean_token_accuracy": 0.7357954680919647, "num_tokens": 20971445.0, "step": 643 }, { "entropy": 1.0180152505636215, "epoch": 0.3443850267379679, "grad_norm": 0.147924542427063, "learning_rate": 0.00015745296917683388, "loss": 1.0215, "mean_token_accuracy": 0.7416360229253769, "num_tokens": 21003936.0, "step": 644 }, { "entropy": 0.9809147417545319, "epoch": 0.3449197860962567, "grad_norm": 0.14757901430130005, "learning_rate": 0.0001573203833916551, "loss": 0.9837, "mean_token_accuracy": 0.7514662742614746, "num_tokens": 21036704.0, "step": 645 }, { "entropy": 1.0416185408830643, "epoch": 0.34545454545454546, "grad_norm": 0.14264920353889465, "learning_rate": 0.00015718765552015087, "loss": 1.0426, "mean_token_accuracy": 0.7363147735595703, "num_tokens": 21069472.0, "step": 646 }, { "entropy": 0.9874245077371597, "epoch": 0.34598930481283424, "grad_norm": 0.14516086876392365, "learning_rate": 0.00015705478596085578, "loss": 0.9906, "mean_token_accuracy": 0.7513746321201324, "num_tokens": 21102240.0, "step": 647 }, { "entropy": 1.0033420026302338, "epoch": 0.34652406417112297, "grad_norm": 0.12710031867027283, "learning_rate": 0.00015692177511273005, "loss": 1.0026, "mean_token_accuracy": 0.7465786933898926, "num_tokens": 21135008.0, "step": 648 }, { "entropy": 1.0446359515190125, "epoch": 0.34705882352941175, "grad_norm": 0.13629251718521118, "learning_rate": 0.00015678862337515807, "loss": 1.0441, "mean_token_accuracy": 0.7418743968009949, "num_tokens": 21167776.0, "step": 649 }, { "entropy": 0.9714336842298508, "epoch": 0.34759358288770054, "grad_norm": 0.1313450187444687, "learning_rate": 0.0001566553311479473, "loss": 0.9537, "mean_token_accuracy": 0.7604472041130066, "num_tokens": 21200544.0, "step": 650 }, { "entropy": 1.0382590889930725, "epoch": 0.3481283422459893, "grad_norm": 0.141065776348114, "learning_rate": 0.0001565218988313269, "loss": 1.0306, "mean_token_accuracy": 0.7439709454774857, "num_tokens": 21233284.0, "step": 651 }, { "entropy": 1.055101215839386, "epoch": 0.34866310160427805, "grad_norm": 0.1360592395067215, "learning_rate": 0.00015638832682594693, "loss": 1.065, "mean_token_accuracy": 0.7342474460601807, "num_tokens": 21265631.0, "step": 652 }, { "entropy": 0.9873165488243103, "epoch": 0.34919786096256683, "grad_norm": 0.13543623685836792, "learning_rate": 0.0001562546155328766, "loss": 0.9769, "mean_token_accuracy": 0.7527990341186523, "num_tokens": 21297750.0, "step": 653 }, { "entropy": 1.0083692371845245, "epoch": 0.3497326203208556, "grad_norm": 0.13810250163078308, "learning_rate": 0.00015612076535360364, "loss": 1.0123, "mean_token_accuracy": 0.7467314302921295, "num_tokens": 21330518.0, "step": 654 }, { "entropy": 0.9826506972312927, "epoch": 0.3502673796791444, "grad_norm": 0.1295788437128067, "learning_rate": 0.00015598677669003262, "loss": 0.9971, "mean_token_accuracy": 0.7486559152603149, "num_tokens": 21363286.0, "step": 655 }, { "entropy": 1.007062405347824, "epoch": 0.3508021390374332, "grad_norm": 0.14045320451259613, "learning_rate": 0.00015585264994448386, "loss": 1.0114, "mean_token_accuracy": 0.7455706298351288, "num_tokens": 21396054.0, "step": 656 }, { "entropy": 0.9903112798929214, "epoch": 0.3513368983957219, "grad_norm": 0.13183598220348358, "learning_rate": 0.00015571838551969248, "loss": 0.9828, "mean_token_accuracy": 0.7507942318916321, "num_tokens": 21428822.0, "step": 657 }, { "entropy": 0.9780847728252411, "epoch": 0.3518716577540107, "grad_norm": 0.13663578033447266, "learning_rate": 0.00015558398381880693, "loss": 0.9698, "mean_token_accuracy": 0.7527776211500168, "num_tokens": 21460828.0, "step": 658 }, { "entropy": 1.0236111879348755, "epoch": 0.3524064171122995, "grad_norm": 0.14290697872638702, "learning_rate": 0.00015544944524538784, "loss": 1.028, "mean_token_accuracy": 0.7419543862342834, "num_tokens": 21493501.0, "step": 659 }, { "entropy": 1.0137494504451752, "epoch": 0.35294117647058826, "grad_norm": 0.13214318454265594, "learning_rate": 0.0001553147702034068, "loss": 1.0105, "mean_token_accuracy": 0.7438588291406631, "num_tokens": 21525788.0, "step": 660 }, { "entropy": 1.0354048311710358, "epoch": 0.353475935828877, "grad_norm": 0.13967375457286835, "learning_rate": 0.0001551799590972452, "loss": 1.0365, "mean_token_accuracy": 0.7424995005130768, "num_tokens": 21558272.0, "step": 661 }, { "entropy": 0.9711843430995941, "epoch": 0.35401069518716577, "grad_norm": 0.13694122433662415, "learning_rate": 0.00015504501233169296, "loss": 0.9785, "mean_token_accuracy": 0.7561705857515335, "num_tokens": 21591040.0, "step": 662 }, { "entropy": 1.0343670547008514, "epoch": 0.35454545454545455, "grad_norm": 0.13821734488010406, "learning_rate": 0.00015490993031194735, "loss": 1.0318, "mean_token_accuracy": 0.7411490827798843, "num_tokens": 21623615.0, "step": 663 }, { "entropy": 0.9731929302215576, "epoch": 0.35508021390374334, "grad_norm": 0.14156140387058258, "learning_rate": 0.00015477471344361177, "loss": 0.9659, "mean_token_accuracy": 0.7541355192661285, "num_tokens": 21656050.0, "step": 664 }, { "entropy": 1.0352194905281067, "epoch": 0.35561497326203206, "grad_norm": 0.13904334604740143, "learning_rate": 0.0001546393621326945, "loss": 1.0232, "mean_token_accuracy": 0.7438294291496277, "num_tokens": 21688818.0, "step": 665 }, { "entropy": 1.0565296709537506, "epoch": 0.35614973262032085, "grad_norm": 0.155804842710495, "learning_rate": 0.0001545038767856075, "loss": 1.0565, "mean_token_accuracy": 0.7359127998352051, "num_tokens": 21721418.0, "step": 666 }, { "entropy": 1.0352788120508194, "epoch": 0.35668449197860963, "grad_norm": 0.15668074786663055, "learning_rate": 0.00015436825780916523, "loss": 1.0307, "mean_token_accuracy": 0.7426380813121796, "num_tokens": 21754186.0, "step": 667 }, { "entropy": 0.9863010793924332, "epoch": 0.3572192513368984, "grad_norm": 0.13319331407546997, "learning_rate": 0.00015423250561058337, "loss": 0.9948, "mean_token_accuracy": 0.7499694526195526, "num_tokens": 21786954.0, "step": 668 }, { "entropy": 1.0410586893558502, "epoch": 0.35775401069518714, "grad_norm": 0.15658292174339294, "learning_rate": 0.00015409662059747762, "loss": 1.0425, "mean_token_accuracy": 0.7384836375713348, "num_tokens": 21819722.0, "step": 669 }, { "entropy": 1.0100665241479874, "epoch": 0.3582887700534759, "grad_norm": 0.17908963561058044, "learning_rate": 0.00015396060317786247, "loss": 1.0263, "mean_token_accuracy": 0.7435024678707123, "num_tokens": 21852173.0, "step": 670 }, { "entropy": 1.04761204123497, "epoch": 0.3588235294117647, "grad_norm": 0.1655554324388504, "learning_rate": 0.00015382445376014995, "loss": 1.0588, "mean_token_accuracy": 0.7338538020849228, "num_tokens": 21884869.0, "step": 671 }, { "entropy": 0.9861435741186142, "epoch": 0.3593582887700535, "grad_norm": 0.13408905267715454, "learning_rate": 0.00015368817275314848, "loss": 0.9848, "mean_token_accuracy": 0.7526270747184753, "num_tokens": 21917637.0, "step": 672 }, { "entropy": 1.0289635956287384, "epoch": 0.3598930481283422, "grad_norm": 0.14396777749061584, "learning_rate": 0.0001535517605660616, "loss": 1.0321, "mean_token_accuracy": 0.7429722100496292, "num_tokens": 21950393.0, "step": 673 }, { "entropy": 1.0139815360307693, "epoch": 0.360427807486631, "grad_norm": 0.47777441143989563, "learning_rate": 0.00015341521760848674, "loss": 0.9951, "mean_token_accuracy": 0.7478394359350204, "num_tokens": 21982768.0, "step": 674 }, { "entropy": 1.0153920203447342, "epoch": 0.3609625668449198, "grad_norm": 0.16766510903835297, "learning_rate": 0.0001532785442904139, "loss": 1.0223, "mean_token_accuracy": 0.7463888227939606, "num_tokens": 22015316.0, "step": 675 }, { "entropy": 1.0040026307106018, "epoch": 0.36149732620320857, "grad_norm": 0.13487398624420166, "learning_rate": 0.00015314174102222462, "loss": 0.9979, "mean_token_accuracy": 0.75, "num_tokens": 22048084.0, "step": 676 }, { "entropy": 0.9781406968832016, "epoch": 0.36203208556149735, "grad_norm": 0.1550796926021576, "learning_rate": 0.00015300480821469058, "loss": 0.9712, "mean_token_accuracy": 0.755834549665451, "num_tokens": 22080852.0, "step": 677 }, { "entropy": 1.0409707725048065, "epoch": 0.3625668449197861, "grad_norm": 0.1624067723751068, "learning_rate": 0.00015286774627897246, "loss": 1.0409, "mean_token_accuracy": 0.7391927093267441, "num_tokens": 22113316.0, "step": 678 }, { "entropy": 1.0326741486787796, "epoch": 0.36310160427807486, "grad_norm": 0.13654468953609467, "learning_rate": 0.0001527305556266185, "loss": 1.0325, "mean_token_accuracy": 0.7431894689798355, "num_tokens": 22145955.0, "step": 679 }, { "entropy": 0.9604055285453796, "epoch": 0.36363636363636365, "grad_norm": 0.15321598947048187, "learning_rate": 0.00015259323666956374, "loss": 0.9473, "mean_token_accuracy": 0.7591336667537689, "num_tokens": 22178723.0, "step": 680 }, { "entropy": 0.9818018078804016, "epoch": 0.36417112299465243, "grad_norm": 0.14409194886684418, "learning_rate": 0.0001524557898201282, "loss": 0.9948, "mean_token_accuracy": 0.7462469041347504, "num_tokens": 22210814.0, "step": 681 }, { "entropy": 0.971877321600914, "epoch": 0.36470588235294116, "grad_norm": 0.13896019756793976, "learning_rate": 0.000152318215491016, "loss": 0.9756, "mean_token_accuracy": 0.7509926110506058, "num_tokens": 22243493.0, "step": 682 }, { "entropy": 1.0054988861083984, "epoch": 0.36524064171122994, "grad_norm": 0.1456078439950943, "learning_rate": 0.0001521805140953141, "loss": 1.005, "mean_token_accuracy": 0.7481652200222015, "num_tokens": 22275840.0, "step": 683 }, { "entropy": 1.0024132430553436, "epoch": 0.3657754010695187, "grad_norm": 0.1406199187040329, "learning_rate": 0.00015204268604649096, "loss": 1.0161, "mean_token_accuracy": 0.7452756762504578, "num_tokens": 22308573.0, "step": 684 }, { "entropy": 1.0235384851694107, "epoch": 0.3663101604278075, "grad_norm": 0.13761594891548157, "learning_rate": 0.00015190473175839524, "loss": 1.0248, "mean_token_accuracy": 0.7436275780200958, "num_tokens": 22341051.0, "step": 685 }, { "entropy": 0.9671759307384491, "epoch": 0.36684491978609624, "grad_norm": 0.13828864693641663, "learning_rate": 0.00015176665164525478, "loss": 0.9566, "mean_token_accuracy": 0.7577615976333618, "num_tokens": 22373581.0, "step": 686 }, { "entropy": 1.0064850896596909, "epoch": 0.367379679144385, "grad_norm": 0.13809967041015625, "learning_rate": 0.0001516284461216752, "loss": 0.9868, "mean_token_accuracy": 0.7506383806467056, "num_tokens": 22406345.0, "step": 687 }, { "entropy": 0.9803296625614166, "epoch": 0.3679144385026738, "grad_norm": 0.1352642923593521, "learning_rate": 0.0001514901156026386, "loss": 0.9801, "mean_token_accuracy": 0.7542366534471512, "num_tokens": 22438701.0, "step": 688 }, { "entropy": 1.0375657081604004, "epoch": 0.3684491978609626, "grad_norm": 0.1408119946718216, "learning_rate": 0.00015135166050350249, "loss": 1.0349, "mean_token_accuracy": 0.7384836375713348, "num_tokens": 22471469.0, "step": 689 }, { "entropy": 1.0136317312717438, "epoch": 0.3689839572192513, "grad_norm": 0.1566014438867569, "learning_rate": 0.00015121308123999846, "loss": 1.0118, "mean_token_accuracy": 0.745280846953392, "num_tokens": 22504023.0, "step": 690 }, { "entropy": 0.9935726672410965, "epoch": 0.3695187165775401, "grad_norm": 0.14247219264507294, "learning_rate": 0.00015107437822823086, "loss": 1.0105, "mean_token_accuracy": 0.7484749108552933, "num_tokens": 22536412.0, "step": 691 }, { "entropy": 1.0056940615177155, "epoch": 0.3700534759358289, "grad_norm": 0.15261436998844147, "learning_rate": 0.00015093555188467556, "loss": 0.9915, "mean_token_accuracy": 0.7517784386873245, "num_tokens": 22569101.0, "step": 692 }, { "entropy": 1.0365577340126038, "epoch": 0.37058823529411766, "grad_norm": 0.15125557780265808, "learning_rate": 0.00015079660262617888, "loss": 1.0414, "mean_token_accuracy": 0.7398726642131805, "num_tokens": 22601254.0, "step": 693 }, { "entropy": 0.9999520629644394, "epoch": 0.3711229946524064, "grad_norm": 0.14125481247901917, "learning_rate": 0.0001506575308699562, "loss": 1.0168, "mean_token_accuracy": 0.7414161711931229, "num_tokens": 22634022.0, "step": 694 }, { "entropy": 0.9776995927095413, "epoch": 0.3716577540106952, "grad_norm": 0.14051266014575958, "learning_rate": 0.00015051833703359058, "loss": 0.978, "mean_token_accuracy": 0.7500337809324265, "num_tokens": 22666733.0, "step": 695 }, { "entropy": 1.013497769832611, "epoch": 0.37219251336898396, "grad_norm": 0.1376378983259201, "learning_rate": 0.00015037902153503183, "loss": 1.0246, "mean_token_accuracy": 0.7472812831401825, "num_tokens": 22699501.0, "step": 696 }, { "entropy": 0.9666542112827301, "epoch": 0.37272727272727274, "grad_norm": 0.14466111361980438, "learning_rate": 0.00015023958479259492, "loss": 0.969, "mean_token_accuracy": 0.754028245806694, "num_tokens": 22732026.0, "step": 697 }, { "entropy": 1.0464845448732376, "epoch": 0.3732620320855615, "grad_norm": 0.13540534675121307, "learning_rate": 0.0001501000272249589, "loss": 1.049, "mean_token_accuracy": 0.7369167357683182, "num_tokens": 22764683.0, "step": 698 }, { "entropy": 0.9937262833118439, "epoch": 0.37379679144385025, "grad_norm": 0.1344163864850998, "learning_rate": 0.0001499603492511657, "loss": 0.9841, "mean_token_accuracy": 0.7498625665903091, "num_tokens": 22796797.0, "step": 699 }, { "entropy": 1.0619724094867706, "epoch": 0.37433155080213903, "grad_norm": 0.13974133133888245, "learning_rate": 0.00014982055129061868, "loss": 1.0481, "mean_token_accuracy": 0.7397315353155136, "num_tokens": 22829462.0, "step": 700 }, { "entropy": 0.986870288848877, "epoch": 0.3748663101604278, "grad_norm": 0.13458530604839325, "learning_rate": 0.00014968063376308156, "loss": 0.9795, "mean_token_accuracy": 0.7523122578859329, "num_tokens": 22861955.0, "step": 701 }, { "entropy": 1.0136100053787231, "epoch": 0.3754010695187166, "grad_norm": 0.13495780527591705, "learning_rate": 0.000149540597088677, "loss": 1.0055, "mean_token_accuracy": 0.7504582107067108, "num_tokens": 22894723.0, "step": 702 }, { "entropy": 1.0152283012866974, "epoch": 0.37593582887700533, "grad_norm": 0.1380801647901535, "learning_rate": 0.0001494004416878855, "loss": 1.0009, "mean_token_accuracy": 0.7458455562591553, "num_tokens": 22927491.0, "step": 703 }, { "entropy": 0.9783022105693817, "epoch": 0.3764705882352941, "grad_norm": 0.1381734162569046, "learning_rate": 0.00014926016798154403, "loss": 0.9774, "mean_token_accuracy": 0.752027615904808, "num_tokens": 22960110.0, "step": 704 }, { "entropy": 1.0288219153881073, "epoch": 0.3770053475935829, "grad_norm": 0.1360618621110916, "learning_rate": 0.0001491197763908447, "loss": 1.033, "mean_token_accuracy": 0.7418133020401001, "num_tokens": 22992878.0, "step": 705 }, { "entropy": 0.9818412363529205, "epoch": 0.3775401069518717, "grad_norm": 0.13849791884422302, "learning_rate": 0.0001489792673373337, "loss": 0.9687, "mean_token_accuracy": 0.7564186900854111, "num_tokens": 23025584.0, "step": 706 }, { "entropy": 0.9837158620357513, "epoch": 0.3780748663101604, "grad_norm": 0.14172674715518951, "learning_rate": 0.0001488386412429099, "loss": 0.9944, "mean_token_accuracy": 0.750454768538475, "num_tokens": 23058056.0, "step": 707 }, { "entropy": 1.0224552154541016, "epoch": 0.3786096256684492, "grad_norm": 0.14361903071403503, "learning_rate": 0.0001486978985298235, "loss": 1.0279, "mean_token_accuracy": 0.7439707517623901, "num_tokens": 23090319.0, "step": 708 }, { "entropy": 1.022420734167099, "epoch": 0.379144385026738, "grad_norm": 0.14148057997226715, "learning_rate": 0.00014855703962067502, "loss": 1.0485, "mean_token_accuracy": 0.7371670007705688, "num_tokens": 23123075.0, "step": 709 }, { "entropy": 1.0484061241149902, "epoch": 0.37967914438502676, "grad_norm": 0.14547275006771088, "learning_rate": 0.0001484160649384138, "loss": 1.0597, "mean_token_accuracy": 0.7371066063642502, "num_tokens": 23155609.0, "step": 710 }, { "entropy": 0.9728060364723206, "epoch": 0.3802139037433155, "grad_norm": 0.1438276469707489, "learning_rate": 0.0001482749749063367, "loss": 0.9747, "mean_token_accuracy": 0.7522335052490234, "num_tokens": 23188155.0, "step": 711 }, { "entropy": 1.0211612433195114, "epoch": 0.38074866310160427, "grad_norm": 0.1322088986635208, "learning_rate": 0.00014813376994808717, "loss": 1.0085, "mean_token_accuracy": 0.745509535074234, "num_tokens": 23220923.0, "step": 712 }, { "entropy": 1.035618707537651, "epoch": 0.38128342245989305, "grad_norm": 0.13468991219997406, "learning_rate": 0.0001479924504876535, "loss": 1.0273, "mean_token_accuracy": 0.7402165532112122, "num_tokens": 23253650.0, "step": 713 }, { "entropy": 1.0208526104688644, "epoch": 0.38181818181818183, "grad_norm": 0.1391787976026535, "learning_rate": 0.000147851016949368, "loss": 1.0081, "mean_token_accuracy": 0.7423631548881531, "num_tokens": 23286418.0, "step": 714 }, { "entropy": 0.9913294613361359, "epoch": 0.38235294117647056, "grad_norm": 0.13534633815288544, "learning_rate": 0.00014770946975790534, "loss": 0.9803, "mean_token_accuracy": 0.7509450316429138, "num_tokens": 23318480.0, "step": 715 }, { "entropy": 1.0779812783002853, "epoch": 0.38288770053475935, "grad_norm": 0.1359453648328781, "learning_rate": 0.00014756780933828157, "loss": 1.0667, "mean_token_accuracy": 0.7307667434215546, "num_tokens": 23351175.0, "step": 716 }, { "entropy": 0.9769474864006042, "epoch": 0.38342245989304813, "grad_norm": 0.14193221926689148, "learning_rate": 0.0001474260361158526, "loss": 0.9711, "mean_token_accuracy": 0.7550708651542664, "num_tokens": 23383943.0, "step": 717 }, { "entropy": 0.9842025190591812, "epoch": 0.3839572192513369, "grad_norm": 0.13903889060020447, "learning_rate": 0.00014728415051631325, "loss": 0.9939, "mean_token_accuracy": 0.7491141259670258, "num_tokens": 23416711.0, "step": 718 }, { "entropy": 1.0313686281442642, "epoch": 0.3844919786096257, "grad_norm": 0.15327425301074982, "learning_rate": 0.0001471421529656956, "loss": 1.0396, "mean_token_accuracy": 0.7405342608690262, "num_tokens": 23449223.0, "step": 719 }, { "entropy": 0.9921634346246719, "epoch": 0.3850267379679144, "grad_norm": 0.1574993133544922, "learning_rate": 0.00014700004389036784, "loss": 1.0025, "mean_token_accuracy": 0.7462220937013626, "num_tokens": 23481947.0, "step": 720 }, { "entropy": 0.9802977293729782, "epoch": 0.3855614973262032, "grad_norm": 0.17303505539894104, "learning_rate": 0.00014685782371703322, "loss": 0.9906, "mean_token_accuracy": 0.7480792254209518, "num_tokens": 23513871.0, "step": 721 }, { "entropy": 1.0380671918392181, "epoch": 0.386096256684492, "grad_norm": 0.1449204385280609, "learning_rate": 0.0001467154928727284, "loss": 1.0363, "mean_token_accuracy": 0.7386363595724106, "num_tokens": 23546639.0, "step": 722 }, { "entropy": 1.0341337323188782, "epoch": 0.3866310160427808, "grad_norm": 0.14379708468914032, "learning_rate": 0.00014657305178482246, "loss": 1.0431, "mean_token_accuracy": 0.741865485906601, "num_tokens": 23578906.0, "step": 723 }, { "entropy": 0.9877164959907532, "epoch": 0.3871657754010695, "grad_norm": 0.15079493820667267, "learning_rate": 0.00014643050088101545, "loss": 0.9661, "mean_token_accuracy": 0.7591642141342163, "num_tokens": 23611674.0, "step": 724 }, { "entropy": 0.9995661824941635, "epoch": 0.3877005347593583, "grad_norm": 0.16325317323207855, "learning_rate": 0.00014628784058933717, "loss": 0.9907, "mean_token_accuracy": 0.7485337257385254, "num_tokens": 23644442.0, "step": 725 }, { "entropy": 0.9665198773145676, "epoch": 0.38823529411764707, "grad_norm": 0.14961887896060944, "learning_rate": 0.00014614507133814584, "loss": 0.9425, "mean_token_accuracy": 0.7581816762685776, "num_tokens": 23676432.0, "step": 726 }, { "entropy": 1.0407246053218842, "epoch": 0.38877005347593585, "grad_norm": 0.14445669949054718, "learning_rate": 0.00014600219355612694, "loss": 1.0202, "mean_token_accuracy": 0.7398888170719147, "num_tokens": 23709200.0, "step": 727 }, { "entropy": 0.9938240647315979, "epoch": 0.3893048128342246, "grad_norm": 0.15048936009407043, "learning_rate": 0.0001458592076722917, "loss": 0.9936, "mean_token_accuracy": 0.7492668628692627, "num_tokens": 23741968.0, "step": 728 }, { "entropy": 0.9772266745567322, "epoch": 0.38983957219251336, "grad_norm": 0.1531447023153305, "learning_rate": 0.00014571611411597605, "loss": 0.9777, "mean_token_accuracy": 0.7505871802568436, "num_tokens": 23774679.0, "step": 729 }, { "entropy": 0.9953857660293579, "epoch": 0.39037433155080214, "grad_norm": 0.14569969475269318, "learning_rate": 0.00014557291331683916, "loss": 0.9935, "mean_token_accuracy": 0.7505297958850861, "num_tokens": 23807385.0, "step": 730 }, { "entropy": 0.9788328260183334, "epoch": 0.39090909090909093, "grad_norm": 0.1406850516796112, "learning_rate": 0.00014542960570486222, "loss": 0.9856, "mean_token_accuracy": 0.7500044405460358, "num_tokens": 23839661.0, "step": 731 }, { "entropy": 1.0059930682182312, "epoch": 0.39144385026737966, "grad_norm": 0.14730219542980194, "learning_rate": 0.00014528619171034717, "loss": 0.9887, "mean_token_accuracy": 0.7483198940753937, "num_tokens": 23872429.0, "step": 732 }, { "entropy": 1.0020950585603714, "epoch": 0.39197860962566844, "grad_norm": 0.14256757497787476, "learning_rate": 0.00014514267176391542, "loss": 1.0101, "mean_token_accuracy": 0.7468536198139191, "num_tokens": 23905197.0, "step": 733 }, { "entropy": 0.9817915260791779, "epoch": 0.3925133689839572, "grad_norm": 0.1441333293914795, "learning_rate": 0.00014499904629650638, "loss": 1.0013, "mean_token_accuracy": 0.7450421899557114, "num_tokens": 23937585.0, "step": 734 }, { "entropy": 1.0192552655935287, "epoch": 0.393048128342246, "grad_norm": 0.15254747867584229, "learning_rate": 0.0001448553157393764, "loss": 1.0264, "mean_token_accuracy": 0.7418911159038544, "num_tokens": 23970285.0, "step": 735 }, { "entropy": 0.996012270450592, "epoch": 0.39358288770053473, "grad_norm": 0.1529400646686554, "learning_rate": 0.00014471148052409732, "loss": 1.004, "mean_token_accuracy": 0.7430134862661362, "num_tokens": 24003021.0, "step": 736 }, { "entropy": 0.9864199012517929, "epoch": 0.3941176470588235, "grad_norm": 0.1369430273771286, "learning_rate": 0.0001445675410825554, "loss": 0.9948, "mean_token_accuracy": 0.7503091543912888, "num_tokens": 24035141.0, "step": 737 }, { "entropy": 0.977002888917923, "epoch": 0.3946524064171123, "grad_norm": 0.16152888536453247, "learning_rate": 0.00014442349784694956, "loss": 0.9748, "mean_token_accuracy": 0.7555102407932281, "num_tokens": 24067417.0, "step": 738 }, { "entropy": 1.027063712477684, "epoch": 0.3951871657754011, "grad_norm": 0.15807777643203735, "learning_rate": 0.00014427935124979068, "loss": 1.0293, "mean_token_accuracy": 0.7409990280866623, "num_tokens": 24099845.0, "step": 739 }, { "entropy": 0.9792217314243317, "epoch": 0.39572192513368987, "grad_norm": 0.13410022854804993, "learning_rate": 0.00014413510172389977, "loss": 0.9764, "mean_token_accuracy": 0.7549799382686615, "num_tokens": 24132073.0, "step": 740 }, { "entropy": 1.0001242756843567, "epoch": 0.3962566844919786, "grad_norm": 0.17179086804389954, "learning_rate": 0.00014399074970240702, "loss": 0.986, "mean_token_accuracy": 0.7496117204427719, "num_tokens": 24164170.0, "step": 741 }, { "entropy": 0.9808478057384491, "epoch": 0.3967914438502674, "grad_norm": 0.17493928968906403, "learning_rate": 0.0001438462956187503, "loss": 0.9799, "mean_token_accuracy": 0.7514575868844986, "num_tokens": 24196887.0, "step": 742 }, { "entropy": 0.9944099485874176, "epoch": 0.39732620320855616, "grad_norm": 0.14331574738025665, "learning_rate": 0.00014370173990667406, "loss": 0.9931, "mean_token_accuracy": 0.7517488449811935, "num_tokens": 24229346.0, "step": 743 }, { "entropy": 1.0133265405893326, "epoch": 0.39786096256684494, "grad_norm": 0.15669451653957367, "learning_rate": 0.00014355708300022778, "loss": 1.0169, "mean_token_accuracy": 0.7478005886077881, "num_tokens": 24262114.0, "step": 744 }, { "entropy": 0.9847605526447296, "epoch": 0.3983957219251337, "grad_norm": 0.16532684862613678, "learning_rate": 0.00014341232533376483, "loss": 0.9982, "mean_token_accuracy": 0.7511332482099533, "num_tokens": 24294705.0, "step": 745 }, { "entropy": 1.0536640584468842, "epoch": 0.39893048128342246, "grad_norm": 0.13670744001865387, "learning_rate": 0.00014326746734194114, "loss": 1.0482, "mean_token_accuracy": 0.737246185541153, "num_tokens": 24327241.0, "step": 746 }, { "entropy": 0.9804176688194275, "epoch": 0.39946524064171124, "grad_norm": 0.1547093689441681, "learning_rate": 0.00014312250945971392, "loss": 1.0004, "mean_token_accuracy": 0.7477412968873978, "num_tokens": 24359934.0, "step": 747 }, { "entropy": 0.9940600544214249, "epoch": 0.4, "grad_norm": 0.15820637345314026, "learning_rate": 0.0001429774521223402, "loss": 1.0192, "mean_token_accuracy": 0.7446351796388626, "num_tokens": 24392082.0, "step": 748 }, { "entropy": 0.9975368827581406, "epoch": 0.40053475935828875, "grad_norm": 0.14259840548038483, "learning_rate": 0.00014283229576537572, "loss": 0.9917, "mean_token_accuracy": 0.7497696876525879, "num_tokens": 24424720.0, "step": 749 }, { "entropy": 1.000609129667282, "epoch": 0.40106951871657753, "grad_norm": 0.13693387806415558, "learning_rate": 0.00014268704082467357, "loss": 0.9917, "mean_token_accuracy": 0.7509403973817825, "num_tokens": 24457378.0, "step": 750 }, { "entropy": 1.0345368534326553, "epoch": 0.4016042780748663, "grad_norm": 0.14850850403308868, "learning_rate": 0.00014254168773638278, "loss": 1.0117, "mean_token_accuracy": 0.7432490140199661, "num_tokens": 24490146.0, "step": 751 }, { "entropy": 1.0088206380605698, "epoch": 0.4021390374331551, "grad_norm": 0.13687895238399506, "learning_rate": 0.00014239623693694712, "loss": 0.9965, "mean_token_accuracy": 0.7491752207279205, "num_tokens": 24522914.0, "step": 752 }, { "entropy": 1.0217233449220657, "epoch": 0.4026737967914438, "grad_norm": 0.13860391080379486, "learning_rate": 0.00014225068886310375, "loss": 1.0085, "mean_token_accuracy": 0.7483198940753937, "num_tokens": 24555682.0, "step": 753 }, { "entropy": 0.9968471378087997, "epoch": 0.4032085561497326, "grad_norm": 0.14066515862941742, "learning_rate": 0.0001421050439518819, "loss": 0.9831, "mean_token_accuracy": 0.7497556209564209, "num_tokens": 24588450.0, "step": 754 }, { "entropy": 0.9938175976276398, "epoch": 0.4037433155080214, "grad_norm": 0.13824208080768585, "learning_rate": 0.00014195930264060159, "loss": 0.9947, "mean_token_accuracy": 0.7470369040966034, "num_tokens": 24621218.0, "step": 755 }, { "entropy": 1.0178113877773285, "epoch": 0.4042780748663102, "grad_norm": 0.13856786489486694, "learning_rate": 0.00014181346536687225, "loss": 1.012, "mean_token_accuracy": 0.7461252063512802, "num_tokens": 24653641.0, "step": 756 }, { "entropy": 0.997627466917038, "epoch": 0.4048128342245989, "grad_norm": 0.14167548716068268, "learning_rate": 0.00014166753256859152, "loss": 1.0032, "mean_token_accuracy": 0.7465094774961472, "num_tokens": 24686245.0, "step": 757 }, { "entropy": 0.9874056875705719, "epoch": 0.4053475935828877, "grad_norm": 0.1428968608379364, "learning_rate": 0.00014152150468394378, "loss": 0.9719, "mean_token_accuracy": 0.7541117370128632, "num_tokens": 24718792.0, "step": 758 }, { "entropy": 0.9934220016002655, "epoch": 0.40588235294117647, "grad_norm": 0.139199897646904, "learning_rate": 0.000141375382151399, "loss": 0.994, "mean_token_accuracy": 0.7510247379541397, "num_tokens": 24751530.0, "step": 759 }, { "entropy": 0.9941307455301285, "epoch": 0.40641711229946526, "grad_norm": 0.14097781479358673, "learning_rate": 0.0001412291654097113, "loss": 1.0032, "mean_token_accuracy": 0.7473887205123901, "num_tokens": 24784094.0, "step": 760 }, { "entropy": 1.0188378989696503, "epoch": 0.406951871657754, "grad_norm": 0.13915149867534637, "learning_rate": 0.00014108285489791768, "loss": 1.0281, "mean_token_accuracy": 0.7360606342554092, "num_tokens": 24816746.0, "step": 761 }, { "entropy": 1.0229030549526215, "epoch": 0.40748663101604277, "grad_norm": 0.14130914211273193, "learning_rate": 0.00014093645105533675, "loss": 1.0192, "mean_token_accuracy": 0.7431410849094391, "num_tokens": 24849356.0, "step": 762 }, { "entropy": 1.0327558815479279, "epoch": 0.40802139037433155, "grad_norm": 0.14207275211811066, "learning_rate": 0.00014078995432156724, "loss": 1.0427, "mean_token_accuracy": 0.7328323721885681, "num_tokens": 24882124.0, "step": 763 }, { "entropy": 0.997843861579895, "epoch": 0.40855614973262033, "grad_norm": 0.13978444039821625, "learning_rate": 0.00014064336513648697, "loss": 0.9981, "mean_token_accuracy": 0.7477700412273407, "num_tokens": 24914892.0, "step": 764 }, { "entropy": 0.9983325004577637, "epoch": 0.4090909090909091, "grad_norm": 0.14260488748550415, "learning_rate": 0.00014049668394025119, "loss": 0.9921, "mean_token_accuracy": 0.7530656903982162, "num_tokens": 24947529.0, "step": 765 }, { "entropy": 1.0412585139274597, "epoch": 0.40962566844919784, "grad_norm": 0.14486606419086456, "learning_rate": 0.00014034991117329156, "loss": 1.0233, "mean_token_accuracy": 0.7403512895107269, "num_tokens": 24979735.0, "step": 766 }, { "entropy": 0.9993823319673538, "epoch": 0.4101604278074866, "grad_norm": 0.15249890089035034, "learning_rate": 0.00014020304727631452, "loss": 0.9798, "mean_token_accuracy": 0.7502091825008392, "num_tokens": 25012457.0, "step": 767 }, { "entropy": 1.007173702120781, "epoch": 0.4106951871657754, "grad_norm": 0.16284959018230438, "learning_rate": 0.00014005609269030037, "loss": 1.009, "mean_token_accuracy": 0.7418438494205475, "num_tokens": 25045225.0, "step": 768 }, { "entropy": 1.006159782409668, "epoch": 0.4112299465240642, "grad_norm": 0.14431804418563843, "learning_rate": 0.00013990904785650158, "loss": 1.0189, "mean_token_accuracy": 0.746059387922287, "num_tokens": 25077993.0, "step": 769 }, { "entropy": 0.8995373249053955, "epoch": 0.4117647058823529, "grad_norm": 0.1489420235157013, "learning_rate": 0.00013976191321644161, "loss": 0.9115, "mean_token_accuracy": 0.7671065330505371, "num_tokens": 25110761.0, "step": 770 }, { "entropy": 1.0053518116474152, "epoch": 0.4122994652406417, "grad_norm": 0.1627763956785202, "learning_rate": 0.0001396146892119136, "loss": 1.0225, "mean_token_accuracy": 0.7457539141178131, "num_tokens": 25143529.0, "step": 771 }, { "entropy": 0.9979801774024963, "epoch": 0.4128342245989305, "grad_norm": 0.1462087333202362, "learning_rate": 0.000139467376284979, "loss": 1.017, "mean_token_accuracy": 0.7471773624420166, "num_tokens": 25176281.0, "step": 772 }, { "entropy": 1.0216413140296936, "epoch": 0.41336898395721927, "grad_norm": 0.14449453353881836, "learning_rate": 0.00013931997487796628, "loss": 1.0358, "mean_token_accuracy": 0.741568922996521, "num_tokens": 25209049.0, "step": 773 }, { "entropy": 0.9999716579914093, "epoch": 0.413903743315508, "grad_norm": 0.15826645493507385, "learning_rate": 0.00013917248543346956, "loss": 1.0139, "mean_token_accuracy": 0.7501832842826843, "num_tokens": 25241817.0, "step": 774 }, { "entropy": 1.0087449997663498, "epoch": 0.4144385026737968, "grad_norm": 0.14455418288707733, "learning_rate": 0.0001390249083943474, "loss": 0.9827, "mean_token_accuracy": 0.7504959404468536, "num_tokens": 25274306.0, "step": 775 }, { "entropy": 1.0708818435668945, "epoch": 0.41497326203208557, "grad_norm": 0.14365294575691223, "learning_rate": 0.0001388772442037212, "loss": 1.0512, "mean_token_accuracy": 0.7327451109886169, "num_tokens": 25306807.0, "step": 776 }, { "entropy": 1.000595673918724, "epoch": 0.41550802139037435, "grad_norm": 0.1471114158630371, "learning_rate": 0.00013872949330497416, "loss": 0.9777, "mean_token_accuracy": 0.7492363154888153, "num_tokens": 25339575.0, "step": 777 }, { "entropy": 1.0157749354839325, "epoch": 0.4160427807486631, "grad_norm": 0.13333575427532196, "learning_rate": 0.00013858165614174985, "loss": 0.9994, "mean_token_accuracy": 0.7480064779520035, "num_tokens": 25372306.0, "step": 778 }, { "entropy": 0.9878341257572174, "epoch": 0.41657754010695186, "grad_norm": 0.1525413691997528, "learning_rate": 0.00013843373315795086, "loss": 0.9778, "mean_token_accuracy": 0.7529974728822708, "num_tokens": 25404641.0, "step": 779 }, { "entropy": 1.0254554450511932, "epoch": 0.41711229946524064, "grad_norm": 0.17594583332538605, "learning_rate": 0.00013828572479773735, "loss": 1.0232, "mean_token_accuracy": 0.7407135963439941, "num_tokens": 25437409.0, "step": 780 }, { "entropy": 0.9847807586193085, "epoch": 0.4176470588235294, "grad_norm": 0.15318170189857483, "learning_rate": 0.00013813763150552602, "loss": 0.9991, "mean_token_accuracy": 0.7479104548692703, "num_tokens": 25469642.0, "step": 781 }, { "entropy": 0.9743617326021194, "epoch": 0.41818181818181815, "grad_norm": 0.13956089317798615, "learning_rate": 0.0001379894537259884, "loss": 0.9872, "mean_token_accuracy": 0.7450818568468094, "num_tokens": 25502410.0, "step": 782 }, { "entropy": 1.0024569183588028, "epoch": 0.41871657754010694, "grad_norm": 0.17088918387889862, "learning_rate": 0.0001378411919040499, "loss": 1.0059, "mean_token_accuracy": 0.743884950876236, "num_tokens": 25534753.0, "step": 783 }, { "entropy": 0.9946104288101196, "epoch": 0.4192513368983957, "grad_norm": 0.15087063610553741, "learning_rate": 0.00013769284648488807, "loss": 0.9915, "mean_token_accuracy": 0.7477570027112961, "num_tokens": 25567224.0, "step": 784 }, { "entropy": 1.005013570189476, "epoch": 0.4197860962566845, "grad_norm": 0.14750632643699646, "learning_rate": 0.00013754441791393167, "loss": 1.0233, "mean_token_accuracy": 0.7427409589290619, "num_tokens": 25599950.0, "step": 785 }, { "entropy": 1.0433073788881302, "epoch": 0.4203208556149733, "grad_norm": 0.158527672290802, "learning_rate": 0.00013739590663685902, "loss": 1.052, "mean_token_accuracy": 0.7364682704210281, "num_tokens": 25632429.0, "step": 786 }, { "entropy": 0.9824782460927963, "epoch": 0.420855614973262, "grad_norm": 0.1468294858932495, "learning_rate": 0.00013724731309959677, "loss": 1.0058, "mean_token_accuracy": 0.7483760863542557, "num_tokens": 25664811.0, "step": 787 }, { "entropy": 1.0482485890388489, "epoch": 0.4213903743315508, "grad_norm": 0.14201639592647552, "learning_rate": 0.00013709863774831864, "loss": 1.0458, "mean_token_accuracy": 0.7343597412109375, "num_tokens": 25697579.0, "step": 788 }, { "entropy": 0.9861636310815811, "epoch": 0.4219251336898396, "grad_norm": 0.1537942737340927, "learning_rate": 0.00013694988102944396, "loss": 0.9716, "mean_token_accuracy": 0.7534565180540085, "num_tokens": 25729923.0, "step": 789 }, { "entropy": 1.0346594899892807, "epoch": 0.42245989304812837, "grad_norm": 0.14477504789829254, "learning_rate": 0.00013680104338963643, "loss": 1.011, "mean_token_accuracy": 0.7462104111909866, "num_tokens": 25762085.0, "step": 790 }, { "entropy": 0.9832995384931564, "epoch": 0.4229946524064171, "grad_norm": 0.14770746231079102, "learning_rate": 0.00013665212527580258, "loss": 0.973, "mean_token_accuracy": 0.7535790205001831, "num_tokens": 25794308.0, "step": 791 }, { "entropy": 0.9789574444293976, "epoch": 0.4235294117647059, "grad_norm": 0.1468288004398346, "learning_rate": 0.00013650312713509076, "loss": 0.9663, "mean_token_accuracy": 0.757819414138794, "num_tokens": 25826744.0, "step": 792 }, { "entropy": 0.9459770172834396, "epoch": 0.42406417112299466, "grad_norm": 0.14010605216026306, "learning_rate": 0.00013635404941488954, "loss": 0.9378, "mean_token_accuracy": 0.7616658359766006, "num_tokens": 25859274.0, "step": 793 }, { "entropy": 0.9862398356199265, "epoch": 0.42459893048128344, "grad_norm": 0.15382592380046844, "learning_rate": 0.00013620489256282642, "loss": 1.0046, "mean_token_accuracy": 0.7458150088787079, "num_tokens": 25892042.0, "step": 794 }, { "entropy": 1.00739586353302, "epoch": 0.42513368983957217, "grad_norm": 0.1552584171295166, "learning_rate": 0.0001360556570267665, "loss": 1.0101, "mean_token_accuracy": 0.7430644780397415, "num_tokens": 25924536.0, "step": 795 }, { "entropy": 0.9220166951417923, "epoch": 0.42566844919786095, "grad_norm": 0.14095531404018402, "learning_rate": 0.0001359063432548111, "loss": 0.9278, "mean_token_accuracy": 0.7619739770889282, "num_tokens": 25956828.0, "step": 796 }, { "entropy": 1.0195706486701965, "epoch": 0.42620320855614974, "grad_norm": 0.15685130655765533, "learning_rate": 0.00013575695169529669, "loss": 1.0378, "mean_token_accuracy": 0.7380559742450714, "num_tokens": 25989596.0, "step": 797 }, { "entropy": 0.9775845110416412, "epoch": 0.4267379679144385, "grad_norm": 0.141723170876503, "learning_rate": 0.0001356074827967929, "loss": 0.9914, "mean_token_accuracy": 0.7492057681083679, "num_tokens": 26022364.0, "step": 798 }, { "entropy": 0.9945722669363022, "epoch": 0.42727272727272725, "grad_norm": 0.14415983855724335, "learning_rate": 0.00013545793700810204, "loss": 1.0042, "mean_token_accuracy": 0.7473423779010773, "num_tokens": 26055132.0, "step": 799 }, { "entropy": 0.9478647261857986, "epoch": 0.42780748663101603, "grad_norm": 0.1493312567472458, "learning_rate": 0.00013530831477825687, "loss": 0.9379, "mean_token_accuracy": 0.7591336667537689, "num_tokens": 26087900.0, "step": 800 }, { "entropy": 1.0211884379386902, "epoch": 0.4283422459893048, "grad_norm": 0.13690948486328125, "learning_rate": 0.00013515861655652, "loss": 1.0188, "mean_token_accuracy": 0.744165450334549, "num_tokens": 26120668.0, "step": 801 }, { "entropy": 1.0712903141975403, "epoch": 0.4288770053475936, "grad_norm": 0.15197446942329407, "learning_rate": 0.00013500884279238202, "loss": 1.0723, "mean_token_accuracy": 0.7357724756002426, "num_tokens": 26153171.0, "step": 802 }, { "entropy": 0.9801386594772339, "epoch": 0.4294117647058823, "grad_norm": 0.14681871235370636, "learning_rate": 0.00013485899393556042, "loss": 0.9791, "mean_token_accuracy": 0.750549852848053, "num_tokens": 26185939.0, "step": 803 }, { "entropy": 0.9867227226495743, "epoch": 0.4299465240641711, "grad_norm": 0.14551116526126862, "learning_rate": 0.0001347090704359982, "loss": 0.9803, "mean_token_accuracy": 0.7532755136489868, "num_tokens": 26218088.0, "step": 804 }, { "entropy": 1.046304628252983, "epoch": 0.4304812834224599, "grad_norm": 0.1528748720884323, "learning_rate": 0.00013455907274386244, "loss": 1.043, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 26250856.0, "step": 805 }, { "entropy": 0.9992060363292694, "epoch": 0.4310160427807487, "grad_norm": 0.14023494720458984, "learning_rate": 0.00013440900130954296, "loss": 0.9985, "mean_token_accuracy": 0.7464259564876556, "num_tokens": 26283624.0, "step": 806 }, { "entropy": 1.0101826190948486, "epoch": 0.43155080213903746, "grad_norm": 0.15695171058177948, "learning_rate": 0.0001342588565836511, "loss": 1.0149, "mean_token_accuracy": 0.7469891905784607, "num_tokens": 26315876.0, "step": 807 }, { "entropy": 1.025041863322258, "epoch": 0.4320855614973262, "grad_norm": 0.14769817888736725, "learning_rate": 0.00013410863901701817, "loss": 1.031, "mean_token_accuracy": 0.7399892508983612, "num_tokens": 26348516.0, "step": 808 }, { "entropy": 1.0198047310113907, "epoch": 0.43262032085561497, "grad_norm": 0.14085803925991058, "learning_rate": 0.00013395834906069424, "loss": 1.0278, "mean_token_accuracy": 0.7437599748373032, "num_tokens": 26381254.0, "step": 809 }, { "entropy": 1.055108368396759, "epoch": 0.43315508021390375, "grad_norm": 0.14935609698295593, "learning_rate": 0.00013380798716594675, "loss": 1.0716, "mean_token_accuracy": 0.7361881732940674, "num_tokens": 26413887.0, "step": 810 }, { "entropy": 0.9968017041683197, "epoch": 0.43368983957219254, "grad_norm": 0.13966552913188934, "learning_rate": 0.00013365755378425914, "loss": 0.9842, "mean_token_accuracy": 0.7523777335882187, "num_tokens": 26446328.0, "step": 811 }, { "entropy": 1.0102295130491257, "epoch": 0.43422459893048126, "grad_norm": 0.1441984623670578, "learning_rate": 0.00013350704936732947, "loss": 0.9868, "mean_token_accuracy": 0.750946968793869, "num_tokens": 26479096.0, "step": 812 }, { "entropy": 0.9585129767656326, "epoch": 0.43475935828877005, "grad_norm": 0.14310646057128906, "learning_rate": 0.00013335647436706912, "loss": 0.9537, "mean_token_accuracy": 0.759378045797348, "num_tokens": 26511864.0, "step": 813 }, { "entropy": 0.9890850186347961, "epoch": 0.43529411764705883, "grad_norm": 0.14099349081516266, "learning_rate": 0.00013320582923560142, "loss": 0.9891, "mean_token_accuracy": 0.7498167157173157, "num_tokens": 26544632.0, "step": 814 }, { "entropy": 0.9900194853544235, "epoch": 0.4358288770053476, "grad_norm": 0.16107913851737976, "learning_rate": 0.00013305511442526026, "loss": 0.9974, "mean_token_accuracy": 0.7471900135278702, "num_tokens": 26577280.0, "step": 815 }, { "entropy": 0.9850942343473434, "epoch": 0.43636363636363634, "grad_norm": 0.14523376524448395, "learning_rate": 0.0001329043303885887, "loss": 1.0001, "mean_token_accuracy": 0.7438905239105225, "num_tokens": 26610048.0, "step": 816 }, { "entropy": 1.0441436767578125, "epoch": 0.4368983957219251, "grad_norm": 0.15303680300712585, "learning_rate": 0.00013275347757833782, "loss": 1.0364, "mean_token_accuracy": 0.7361928671598434, "num_tokens": 26642730.0, "step": 817 }, { "entropy": 1.006567895412445, "epoch": 0.4374331550802139, "grad_norm": 0.14957407116889954, "learning_rate": 0.000132602556447465, "loss": 1.0212, "mean_token_accuracy": 0.7475256621837616, "num_tokens": 26675498.0, "step": 818 }, { "entropy": 0.9686824828386307, "epoch": 0.4379679144385027, "grad_norm": 0.1421763002872467, "learning_rate": 0.0001324515674491329, "loss": 0.9715, "mean_token_accuracy": 0.7550385147333145, "num_tokens": 26707685.0, "step": 819 }, { "entropy": 0.9856327921152115, "epoch": 0.4385026737967914, "grad_norm": 0.14262866973876953, "learning_rate": 0.00013230051103670792, "loss": 0.9634, "mean_token_accuracy": 0.7571175396442413, "num_tokens": 26740453.0, "step": 820 }, { "entropy": 1.0294437855482101, "epoch": 0.4390374331550802, "grad_norm": 0.13774727284908295, "learning_rate": 0.0001321493876637589, "loss": 1.007, "mean_token_accuracy": 0.7464000433683395, "num_tokens": 26773101.0, "step": 821 }, { "entropy": 1.0105653703212738, "epoch": 0.439572192513369, "grad_norm": 0.14089708030223846, "learning_rate": 0.00013199819778405569, "loss": 0.9933, "mean_token_accuracy": 0.7512582391500473, "num_tokens": 26805686.0, "step": 822 }, { "entropy": 1.0349905490875244, "epoch": 0.44010695187165777, "grad_norm": 0.1354954093694687, "learning_rate": 0.0001318469418515679, "loss": 1.0276, "mean_token_accuracy": 0.7418626844882965, "num_tokens": 26838265.0, "step": 823 }, { "entropy": 0.9851645231246948, "epoch": 0.4406417112299465, "grad_norm": 0.14308570325374603, "learning_rate": 0.0001316956203204634, "loss": 0.9748, "mean_token_accuracy": 0.7493155598640442, "num_tokens": 26870521.0, "step": 824 }, { "entropy": 1.0248220413923264, "epoch": 0.4411764705882353, "grad_norm": 0.15384997427463531, "learning_rate": 0.0001315442336451071, "loss": 1.0232, "mean_token_accuracy": 0.74516561627388, "num_tokens": 26903232.0, "step": 825 }, { "entropy": 0.9613745510578156, "epoch": 0.44171122994652406, "grad_norm": 0.14238059520721436, "learning_rate": 0.00013139278228005947, "loss": 0.9549, "mean_token_accuracy": 0.7580950558185577, "num_tokens": 26936000.0, "step": 826 }, { "entropy": 0.9351648837327957, "epoch": 0.44224598930481285, "grad_norm": 0.16106504201889038, "learning_rate": 0.00013124126668007523, "loss": 0.9674, "mean_token_accuracy": 0.7571175396442413, "num_tokens": 26968768.0, "step": 827 }, { "entropy": 0.9881116300821304, "epoch": 0.44278074866310163, "grad_norm": 0.15062279999256134, "learning_rate": 0.00013108968730010204, "loss": 0.9876, "mean_token_accuracy": 0.7498167902231216, "num_tokens": 27001383.0, "step": 828 }, { "entropy": 0.9285563677549362, "epoch": 0.44331550802139036, "grad_norm": 0.14957569539546967, "learning_rate": 0.00013093804459527893, "loss": 0.9313, "mean_token_accuracy": 0.7575452029705048, "num_tokens": 27034151.0, "step": 829 }, { "entropy": 0.9487057030200958, "epoch": 0.44385026737967914, "grad_norm": 0.15658754110336304, "learning_rate": 0.0001307863390209352, "loss": 0.9504, "mean_token_accuracy": 0.7548264861106873, "num_tokens": 27066919.0, "step": 830 }, { "entropy": 1.002123698592186, "epoch": 0.4443850267379679, "grad_norm": 0.1467520296573639, "learning_rate": 0.0001306345710325888, "loss": 1.0041, "mean_token_accuracy": 0.7428213655948639, "num_tokens": 27099687.0, "step": 831 }, { "entropy": 0.9975673407316208, "epoch": 0.4449197860962567, "grad_norm": 0.15837520360946655, "learning_rate": 0.0001304827410859453, "loss": 1.0, "mean_token_accuracy": 0.7424547970294952, "num_tokens": 27132455.0, "step": 832 }, { "entropy": 0.9692659825086594, "epoch": 0.44545454545454544, "grad_norm": 0.14246657490730286, "learning_rate": 0.00013033084963689603, "loss": 0.9597, "mean_token_accuracy": 0.7527073472738266, "num_tokens": 27165037.0, "step": 833 }, { "entropy": 1.0118547677993774, "epoch": 0.4459893048128342, "grad_norm": 0.1440034955739975, "learning_rate": 0.00013017889714151724, "loss": 0.9975, "mean_token_accuracy": 0.7490766048431396, "num_tokens": 27197774.0, "step": 834 }, { "entropy": 0.9778098613023758, "epoch": 0.446524064171123, "grad_norm": 0.15016303956508636, "learning_rate": 0.00013002688405606828, "loss": 0.9959, "mean_token_accuracy": 0.7487170100212097, "num_tokens": 27230542.0, "step": 835 }, { "entropy": 0.9894908368587494, "epoch": 0.4470588235294118, "grad_norm": 0.13868270814418793, "learning_rate": 0.00012987481083699063, "loss": 0.9977, "mean_token_accuracy": 0.7494760751724243, "num_tokens": 27263147.0, "step": 836 }, { "entropy": 0.9909839183092117, "epoch": 0.4475935828877005, "grad_norm": 0.14243672788143158, "learning_rate": 0.00012972267794090613, "loss": 0.9888, "mean_token_accuracy": 0.7501832842826843, "num_tokens": 27295915.0, "step": 837 }, { "entropy": 0.9736573696136475, "epoch": 0.4481283422459893, "grad_norm": 0.1431356966495514, "learning_rate": 0.0001295704858246159, "loss": 0.9723, "mean_token_accuracy": 0.7547959387302399, "num_tokens": 27328683.0, "step": 838 }, { "entropy": 0.9292502403259277, "epoch": 0.4486631016042781, "grad_norm": 0.1411665827035904, "learning_rate": 0.00012941823494509898, "loss": 0.9282, "mean_token_accuracy": 0.7612108886241913, "num_tokens": 27361451.0, "step": 839 }, { "entropy": 1.0365600287914276, "epoch": 0.44919786096256686, "grad_norm": 0.13857612013816833, "learning_rate": 0.00012926592575951065, "loss": 1.0261, "mean_token_accuracy": 0.7461815774440765, "num_tokens": 27394219.0, "step": 840 }, { "entropy": 0.9583263546228409, "epoch": 0.4497326203208556, "grad_norm": 0.1433681845664978, "learning_rate": 0.00012911355872518142, "loss": 0.9393, "mean_token_accuracy": 0.7587670981884003, "num_tokens": 27426987.0, "step": 841 }, { "entropy": 1.020112156867981, "epoch": 0.4502673796791444, "grad_norm": 0.14346186816692352, "learning_rate": 0.00012896113429961535, "loss": 1.0132, "mean_token_accuracy": 0.7436766922473907, "num_tokens": 27459755.0, "step": 842 }, { "entropy": 1.013671264052391, "epoch": 0.45080213903743316, "grad_norm": 0.14903905987739563, "learning_rate": 0.0001288086529404891, "loss": 1.001, "mean_token_accuracy": 0.7490814626216888, "num_tokens": 27492481.0, "step": 843 }, { "entropy": 0.9899970591068268, "epoch": 0.45133689839572194, "grad_norm": 0.1441020518541336, "learning_rate": 0.00012865611510564994, "loss": 0.9945, "mean_token_accuracy": 0.7511302530765533, "num_tokens": 27525249.0, "step": 844 }, { "entropy": 0.9621583074331284, "epoch": 0.45187165775401067, "grad_norm": 0.15059061348438263, "learning_rate": 0.00012850352125311494, "loss": 0.9645, "mean_token_accuracy": 0.7551609575748444, "num_tokens": 27557373.0, "step": 845 }, { "entropy": 0.9534913152456284, "epoch": 0.45240641711229945, "grad_norm": 0.1444634199142456, "learning_rate": 0.00012835087184106934, "loss": 0.9622, "mean_token_accuracy": 0.7565398961305618, "num_tokens": 27589801.0, "step": 846 }, { "entropy": 0.9190541058778763, "epoch": 0.45294117647058824, "grad_norm": 0.15014390647411346, "learning_rate": 0.00012819816732786513, "loss": 0.9335, "mean_token_accuracy": 0.7609359622001648, "num_tokens": 27622569.0, "step": 847 }, { "entropy": 0.9643352031707764, "epoch": 0.453475935828877, "grad_norm": 0.1724693328142166, "learning_rate": 0.0001280454081720198, "loss": 0.9732, "mean_token_accuracy": 0.7566371709108353, "num_tokens": 27655309.0, "step": 848 }, { "entropy": 1.0093938112258911, "epoch": 0.4540106951871658, "grad_norm": 0.15161548554897308, "learning_rate": 0.00012789259483221493, "loss": 1.0115, "mean_token_accuracy": 0.7467968165874481, "num_tokens": 27687970.0, "step": 849 }, { "entropy": 0.9799563735723495, "epoch": 0.45454545454545453, "grad_norm": 0.14368638396263123, "learning_rate": 0.00012773972776729476, "loss": 0.9727, "mean_token_accuracy": 0.7515579164028168, "num_tokens": 27720738.0, "step": 850 }, { "entropy": 1.012539565563202, "epoch": 0.4550802139037433, "grad_norm": 0.15072520077228546, "learning_rate": 0.00012758680743626483, "loss": 1.0122, "mean_token_accuracy": 0.74500472843647, "num_tokens": 27753417.0, "step": 851 }, { "entropy": 0.9671606421470642, "epoch": 0.4556149732620321, "grad_norm": 0.1450292468070984, "learning_rate": 0.00012743383429829073, "loss": 0.9603, "mean_token_accuracy": 0.7547348439693451, "num_tokens": 27786185.0, "step": 852 }, { "entropy": 0.966804251074791, "epoch": 0.4561497326203209, "grad_norm": 0.1427607387304306, "learning_rate": 0.0001272808088126964, "loss": 0.9497, "mean_token_accuracy": 0.7570531070232391, "num_tokens": 27818856.0, "step": 853 }, { "entropy": 1.018742486834526, "epoch": 0.4566844919786096, "grad_norm": 0.1700286716222763, "learning_rate": 0.00012712773143896326, "loss": 1.0105, "mean_token_accuracy": 0.7449291348457336, "num_tokens": 27851624.0, "step": 854 }, { "entropy": 1.0010544210672379, "epoch": 0.4572192513368984, "grad_norm": 0.15574532747268677, "learning_rate": 0.00012697460263672822, "loss": 1.0087, "mean_token_accuracy": 0.7441959977149963, "num_tokens": 27884392.0, "step": 855 }, { "entropy": 0.9586425423622131, "epoch": 0.4577540106951872, "grad_norm": 0.14490839838981628, "learning_rate": 0.0001268214228657828, "loss": 0.9636, "mean_token_accuracy": 0.7560789287090302, "num_tokens": 27917160.0, "step": 856 }, { "entropy": 0.9910256862640381, "epoch": 0.45828877005347596, "grad_norm": 0.16581670939922333, "learning_rate": 0.00012666819258607158, "loss": 1.0053, "mean_token_accuracy": 0.7519976049661636, "num_tokens": 27949828.0, "step": 857 }, { "entropy": 0.9686436355113983, "epoch": 0.4588235294117647, "grad_norm": 0.15091009438037872, "learning_rate": 0.0001265149122576906, "loss": 0.9813, "mean_token_accuracy": 0.7518328428268433, "num_tokens": 27982596.0, "step": 858 }, { "entropy": 0.9729765504598618, "epoch": 0.45935828877005347, "grad_norm": 0.15223021805286407, "learning_rate": 0.0001263615823408864, "loss": 0.9776, "mean_token_accuracy": 0.7539650201797485, "num_tokens": 28014505.0, "step": 859 }, { "entropy": 0.9859947264194489, "epoch": 0.45989304812834225, "grad_norm": 0.1753186583518982, "learning_rate": 0.00012620820329605433, "loss": 0.9838, "mean_token_accuracy": 0.7529747784137726, "num_tokens": 28046971.0, "step": 860 }, { "entropy": 1.02519690990448, "epoch": 0.46042780748663104, "grad_norm": 0.15426649153232574, "learning_rate": 0.00012605477558373727, "loss": 1.0202, "mean_token_accuracy": 0.7468536198139191, "num_tokens": 28079739.0, "step": 861 }, { "entropy": 0.9805189371109009, "epoch": 0.46096256684491976, "grad_norm": 0.15347301959991455, "learning_rate": 0.00012590129966462418, "loss": 0.9741, "mean_token_accuracy": 0.7568426132202148, "num_tokens": 28112507.0, "step": 862 }, { "entropy": 0.9748925268650055, "epoch": 0.46149732620320855, "grad_norm": 0.16348502039909363, "learning_rate": 0.0001257477759995488, "loss": 0.9668, "mean_token_accuracy": 0.7535220235586166, "num_tokens": 28144969.0, "step": 863 }, { "entropy": 1.0007185190916061, "epoch": 0.46203208556149733, "grad_norm": 0.13907505571842194, "learning_rate": 0.00012559420504948827, "loss": 1.0023, "mean_token_accuracy": 0.7492653876543045, "num_tokens": 28177395.0, "step": 864 }, { "entropy": 0.97891004383564, "epoch": 0.4625668449197861, "grad_norm": 0.16052573919296265, "learning_rate": 0.00012544058727556161, "loss": 0.9777, "mean_token_accuracy": 0.7531695663928986, "num_tokens": 28209795.0, "step": 865 }, { "entropy": 0.9599785953760147, "epoch": 0.46310160427807484, "grad_norm": 0.1533554643392563, "learning_rate": 0.00012528692313902858, "loss": 0.9597, "mean_token_accuracy": 0.7586449086666107, "num_tokens": 28242563.0, "step": 866 }, { "entropy": 0.9907165765762329, "epoch": 0.4636363636363636, "grad_norm": 0.14271271228790283, "learning_rate": 0.00012513321310128804, "loss": 0.983, "mean_token_accuracy": 0.750079333782196, "num_tokens": 28274945.0, "step": 867 }, { "entropy": 1.0088491588830948, "epoch": 0.4641711229946524, "grad_norm": 0.1458507478237152, "learning_rate": 0.00012497945762387672, "loss": 1.0089, "mean_token_accuracy": 0.7421345859766006, "num_tokens": 28307272.0, "step": 868 }, { "entropy": 1.0230762511491776, "epoch": 0.4647058823529412, "grad_norm": 0.14179329574108124, "learning_rate": 0.00012482565716846774, "loss": 1.0319, "mean_token_accuracy": 0.743267372250557, "num_tokens": 28339540.0, "step": 869 }, { "entropy": 1.0096471905708313, "epoch": 0.46524064171123, "grad_norm": 0.14528143405914307, "learning_rate": 0.00012467181219686936, "loss": 0.9995, "mean_token_accuracy": 0.7422715127468109, "num_tokens": 28372308.0, "step": 870 }, { "entropy": 1.0115074217319489, "epoch": 0.4657754010695187, "grad_norm": 0.14259186387062073, "learning_rate": 0.00012451792317102345, "loss": 1.0178, "mean_token_accuracy": 0.7431919872760773, "num_tokens": 28405028.0, "step": 871 }, { "entropy": 1.0354815274477005, "epoch": 0.4663101604278075, "grad_norm": 0.15655957162380219, "learning_rate": 0.00012436399055300415, "loss": 1.0393, "mean_token_accuracy": 0.739911362528801, "num_tokens": 28437386.0, "step": 872 }, { "entropy": 1.028480052947998, "epoch": 0.46684491978609627, "grad_norm": 0.14866073429584503, "learning_rate": 0.00012421001480501648, "loss": 1.0383, "mean_token_accuracy": 0.7398970276117325, "num_tokens": 28469968.0, "step": 873 }, { "entropy": 0.972516804933548, "epoch": 0.46737967914438505, "grad_norm": 0.1423788070678711, "learning_rate": 0.00012405599638939501, "loss": 0.9601, "mean_token_accuracy": 0.7564070969820023, "num_tokens": 28502628.0, "step": 874 }, { "entropy": 1.001707598567009, "epoch": 0.4679144385026738, "grad_norm": 0.15020547807216644, "learning_rate": 0.0001239019357686025, "loss": 0.9869, "mean_token_accuracy": 0.7530111223459244, "num_tokens": 28534961.0, "step": 875 }, { "entropy": 1.0281101614236832, "epoch": 0.46844919786096256, "grad_norm": 0.1444360762834549, "learning_rate": 0.00012374783340522816, "loss": 1.0178, "mean_token_accuracy": 0.7450513243675232, "num_tokens": 28567729.0, "step": 876 }, { "entropy": 0.9586435705423355, "epoch": 0.46898395721925135, "grad_norm": 0.14336393773555756, "learning_rate": 0.0001235936897619869, "loss": 0.95, "mean_token_accuracy": 0.7565875798463821, "num_tokens": 28600231.0, "step": 877 }, { "entropy": 0.9826674312353134, "epoch": 0.46951871657754013, "grad_norm": 0.14744499325752258, "learning_rate": 0.0001234395053017173, "loss": 0.975, "mean_token_accuracy": 0.7538526058197021, "num_tokens": 28632909.0, "step": 878 }, { "entropy": 0.9591492861509323, "epoch": 0.47005347593582886, "grad_norm": 0.14284496009349823, "learning_rate": 0.00012328528048738063, "loss": 0.9697, "mean_token_accuracy": 0.7575547695159912, "num_tokens": 28665573.0, "step": 879 }, { "entropy": 1.0010185837745667, "epoch": 0.47058823529411764, "grad_norm": 0.14766953885555267, "learning_rate": 0.00012313101578205932, "loss": 1.0112, "mean_token_accuracy": 0.7468580156564713, "num_tokens": 28697136.0, "step": 880 }, { "entropy": 0.9989901632070541, "epoch": 0.4711229946524064, "grad_norm": 0.14356346428394318, "learning_rate": 0.0001229767116489555, "loss": 0.9938, "mean_token_accuracy": 0.7492057681083679, "num_tokens": 28729904.0, "step": 881 }, { "entropy": 0.9695397615432739, "epoch": 0.4716577540106952, "grad_norm": 0.14482583105564117, "learning_rate": 0.00012282236855138982, "loss": 0.9823, "mean_token_accuracy": 0.7534518539905548, "num_tokens": 28762672.0, "step": 882 }, { "entropy": 0.9446674883365631, "epoch": 0.47219251336898393, "grad_norm": 0.15149655938148499, "learning_rate": 0.00012266798695279978, "loss": 0.9578, "mean_token_accuracy": 0.7553152441978455, "num_tokens": 28795440.0, "step": 883 }, { "entropy": 0.9982113689184189, "epoch": 0.4727272727272727, "grad_norm": 0.149799183011055, "learning_rate": 0.00012251356731673866, "loss": 0.9681, "mean_token_accuracy": 0.7516845017671585, "num_tokens": 28827741.0, "step": 884 }, { "entropy": 1.0306490808725357, "epoch": 0.4732620320855615, "grad_norm": 0.15076981484889984, "learning_rate": 0.00012235911010687373, "loss": 1.039, "mean_token_accuracy": 0.7403458207845688, "num_tokens": 28859822.0, "step": 885 }, { "entropy": 1.0157879292964935, "epoch": 0.4737967914438503, "grad_norm": 0.14644651114940643, "learning_rate": 0.00012220461578698526, "loss": 1.0187, "mean_token_accuracy": 0.7441051155328751, "num_tokens": 28892138.0, "step": 886 }, { "entropy": 1.0000844895839691, "epoch": 0.474331550802139, "grad_norm": 0.14986386895179749, "learning_rate": 0.00012205008482096487, "loss": 1.0087, "mean_token_accuracy": 0.7471285462379456, "num_tokens": 28924906.0, "step": 887 }, { "entropy": 0.9762436598539352, "epoch": 0.4748663101604278, "grad_norm": 0.15048319101333618, "learning_rate": 0.00012189551767281427, "loss": 0.9881, "mean_token_accuracy": 0.74590665102005, "num_tokens": 28957674.0, "step": 888 }, { "entropy": 0.9624285250902176, "epoch": 0.4754010695187166, "grad_norm": 0.1492486149072647, "learning_rate": 0.00012174091480664376, "loss": 0.9728, "mean_token_accuracy": 0.7519520968198776, "num_tokens": 28990390.0, "step": 889 }, { "entropy": 1.0277507901191711, "epoch": 0.47593582887700536, "grad_norm": 0.1422109156847, "learning_rate": 0.00012158627668667091, "loss": 1.0284, "mean_token_accuracy": 0.7427111566066742, "num_tokens": 29023106.0, "step": 890 }, { "entropy": 1.029417023062706, "epoch": 0.4764705882352941, "grad_norm": 0.14125901460647583, "learning_rate": 0.0001214316037772191, "loss": 1.0008, "mean_token_accuracy": 0.7472001910209656, "num_tokens": 29055779.0, "step": 891 }, { "entropy": 0.9816141426563263, "epoch": 0.4770053475935829, "grad_norm": 0.1472245454788208, "learning_rate": 0.00012127689654271626, "loss": 0.9673, "mean_token_accuracy": 0.7555326372385025, "num_tokens": 29088386.0, "step": 892 }, { "entropy": 0.9743991792201996, "epoch": 0.47754010695187166, "grad_norm": 0.14505396783351898, "learning_rate": 0.00012112215544769332, "loss": 0.9704, "mean_token_accuracy": 0.75552998483181, "num_tokens": 29121084.0, "step": 893 }, { "entropy": 0.984330803155899, "epoch": 0.47807486631016044, "grad_norm": 0.14463602006435394, "learning_rate": 0.00012096738095678287, "loss": 0.9661, "mean_token_accuracy": 0.7528714537620544, "num_tokens": 29153852.0, "step": 894 }, { "entropy": 0.9667259305715561, "epoch": 0.4786096256684492, "grad_norm": 0.14450709521770477, "learning_rate": 0.00012081257353471784, "loss": 0.9738, "mean_token_accuracy": 0.7543333023786545, "num_tokens": 29186546.0, "step": 895 }, { "entropy": 0.9456564784049988, "epoch": 0.47914438502673795, "grad_norm": 0.14816603064537048, "learning_rate": 0.00012065773364633001, "loss": 0.9543, "mean_token_accuracy": 0.7598668038845062, "num_tokens": 29219314.0, "step": 896 }, { "entropy": 0.9788111746311188, "epoch": 0.47967914438502673, "grad_norm": 0.17237494885921478, "learning_rate": 0.00012050286175654861, "loss": 0.9996, "mean_token_accuracy": 0.7479639053344727, "num_tokens": 29251749.0, "step": 897 }, { "entropy": 0.9980193078517914, "epoch": 0.4802139037433155, "grad_norm": 0.1541370004415512, "learning_rate": 0.000120347958330399, "loss": 1.0037, "mean_token_accuracy": 0.7468230724334717, "num_tokens": 29284517.0, "step": 898 }, { "entropy": 0.9955044686794281, "epoch": 0.4807486631016043, "grad_norm": 0.15029774606227875, "learning_rate": 0.00012019302383300121, "loss": 0.9974, "mean_token_accuracy": 0.7520772218704224, "num_tokens": 29317285.0, "step": 899 }, { "entropy": 0.9913774579763412, "epoch": 0.48128342245989303, "grad_norm": 0.1621326059103012, "learning_rate": 0.00012003805872956863, "loss": 1.0074, "mean_token_accuracy": 0.7462121248245239, "num_tokens": 29350053.0, "step": 900 }, { "entropy": 0.9967214167118073, "epoch": 0.4818181818181818, "grad_norm": 0.17531584203243256, "learning_rate": 0.00011988306348540642, "loss": 1.0158, "mean_token_accuracy": 0.7427083104848862, "num_tokens": 29382226.0, "step": 901 }, { "entropy": 1.0519507229328156, "epoch": 0.4823529411764706, "grad_norm": 0.16274113953113556, "learning_rate": 0.00011972803856591036, "loss": 1.069, "mean_token_accuracy": 0.7385945022106171, "num_tokens": 29414591.0, "step": 902 }, { "entropy": 1.022826910018921, "epoch": 0.4828877005347594, "grad_norm": 0.14667034149169922, "learning_rate": 0.00011957298443656528, "loss": 1.0044, "mean_token_accuracy": 0.746406689286232, "num_tokens": 29447242.0, "step": 903 }, { "entropy": 1.0191742479801178, "epoch": 0.4834224598930481, "grad_norm": 0.1471247673034668, "learning_rate": 0.0001194179015629437, "loss": 1.0026, "mean_token_accuracy": 0.7468249201774597, "num_tokens": 29479897.0, "step": 904 }, { "entropy": 1.0249872356653214, "epoch": 0.4839572192513369, "grad_norm": 0.14551971852779388, "learning_rate": 0.00011926279041070452, "loss": 1.0099, "mean_token_accuracy": 0.7473421692848206, "num_tokens": 29512280.0, "step": 905 }, { "entropy": 0.9875836968421936, "epoch": 0.4844919786096257, "grad_norm": 0.14124327898025513, "learning_rate": 0.0001191076514455915, "loss": 0.9656, "mean_token_accuracy": 0.7548264861106873, "num_tokens": 29545048.0, "step": 906 }, { "entropy": 1.0192315578460693, "epoch": 0.48502673796791446, "grad_norm": 0.14510181546211243, "learning_rate": 0.00011895248513343193, "loss": 1.0005, "mean_token_accuracy": 0.7453512996435165, "num_tokens": 29577383.0, "step": 907 }, { "entropy": 0.9869286417961121, "epoch": 0.4855614973262032, "grad_norm": 0.16035196185112, "learning_rate": 0.0001187972919401352, "loss": 0.9887, "mean_token_accuracy": 0.7483401149511337, "num_tokens": 29609831.0, "step": 908 }, { "entropy": 0.9757640361785889, "epoch": 0.48609625668449197, "grad_norm": 0.17180685698986053, "learning_rate": 0.00011864207233169136, "loss": 0.9699, "mean_token_accuracy": 0.7536656856536865, "num_tokens": 29642599.0, "step": 909 }, { "entropy": 1.001309648156166, "epoch": 0.48663101604278075, "grad_norm": 0.15699726343154907, "learning_rate": 0.00011848682677416994, "loss": 0.9882, "mean_token_accuracy": 0.75, "num_tokens": 29675367.0, "step": 910 }, { "entropy": 0.9576734304428101, "epoch": 0.48716577540106953, "grad_norm": 0.1499813348054886, "learning_rate": 0.00011833155573371815, "loss": 0.9598, "mean_token_accuracy": 0.7557627856731415, "num_tokens": 29707958.0, "step": 911 }, { "entropy": 1.0111593008041382, "epoch": 0.48770053475935826, "grad_norm": 0.14601005613803864, "learning_rate": 0.00011817625967655996, "loss": 1.0223, "mean_token_accuracy": 0.7433928549289703, "num_tokens": 29740374.0, "step": 912 }, { "entropy": 0.9894819855690002, "epoch": 0.48823529411764705, "grad_norm": 0.1447693109512329, "learning_rate": 0.00011802093906899432, "loss": 1.0035, "mean_token_accuracy": 0.7454178929328918, "num_tokens": 29773142.0, "step": 913 }, { "entropy": 1.0526953637599945, "epoch": 0.48877005347593583, "grad_norm": 0.15337249636650085, "learning_rate": 0.00011786559437739386, "loss": 1.0696, "mean_token_accuracy": 0.7307856976985931, "num_tokens": 29805910.0, "step": 914 }, { "entropy": 1.005323514342308, "epoch": 0.4893048128342246, "grad_norm": 0.1650882065296173, "learning_rate": 0.00011771022606820364, "loss": 1.0051, "mean_token_accuracy": 0.7460218220949173, "num_tokens": 29838478.0, "step": 915 }, { "entropy": 1.0439465939998627, "epoch": 0.4898395721925134, "grad_norm": 0.15761952102184296, "learning_rate": 0.00011755483460793954, "loss": 1.0571, "mean_token_accuracy": 0.7354590445756912, "num_tokens": 29871196.0, "step": 916 }, { "entropy": 1.0409222543239594, "epoch": 0.4903743315508021, "grad_norm": 0.14647220075130463, "learning_rate": 0.00011739942046318703, "loss": 1.0447, "mean_token_accuracy": 0.7362842261791229, "num_tokens": 29903964.0, "step": 917 }, { "entropy": 0.9852460622787476, "epoch": 0.4909090909090909, "grad_norm": 0.15946930646896362, "learning_rate": 0.00011724398410059962, "loss": 0.9707, "mean_token_accuracy": 0.7496487200260162, "num_tokens": 29936309.0, "step": 918 }, { "entropy": 1.020201325416565, "epoch": 0.4914438502673797, "grad_norm": 0.17865873873233795, "learning_rate": 0.0001170885259868976, "loss": 0.9944, "mean_token_accuracy": 0.747556209564209, "num_tokens": 29969077.0, "step": 919 }, { "entropy": 0.9647395312786102, "epoch": 0.4919786096256685, "grad_norm": 0.14664915204048157, "learning_rate": 0.00011693304658886654, "loss": 0.955, "mean_token_accuracy": 0.759252518415451, "num_tokens": 30001668.0, "step": 920 }, { "entropy": 1.0127326548099518, "epoch": 0.4925133689839572, "grad_norm": 0.16004888713359833, "learning_rate": 0.00011677754637335591, "loss": 1.0018, "mean_token_accuracy": 0.7477956712245941, "num_tokens": 30034131.0, "step": 921 }, { "entropy": 0.998745933175087, "epoch": 0.493048128342246, "grad_norm": 0.17857477068901062, "learning_rate": 0.0001166220258072777, "loss": 1.0134, "mean_token_accuracy": 0.7447153031826019, "num_tokens": 30066899.0, "step": 922 }, { "entropy": 0.9973787516355515, "epoch": 0.49358288770053477, "grad_norm": 0.14939211308956146, "learning_rate": 0.00011646648535760498, "loss": 1.0064, "mean_token_accuracy": 0.7462234795093536, "num_tokens": 30099318.0, "step": 923 }, { "entropy": 1.0201772600412369, "epoch": 0.49411764705882355, "grad_norm": 0.19249583780765533, "learning_rate": 0.00011631092549137057, "loss": 1.0283, "mean_token_accuracy": 0.7428824603557587, "num_tokens": 30132086.0, "step": 924 }, { "entropy": 1.0128138363361359, "epoch": 0.4946524064171123, "grad_norm": 0.1913338303565979, "learning_rate": 0.0001161553466756656, "loss": 1.0195, "mean_token_accuracy": 0.7421518564224243, "num_tokens": 30164631.0, "step": 925 }, { "entropy": 0.9628503024578094, "epoch": 0.49518716577540106, "grad_norm": 0.15156152844429016, "learning_rate": 0.00011599974937763799, "loss": 0.9489, "mean_token_accuracy": 0.7556818127632141, "num_tokens": 30197399.0, "step": 926 }, { "entropy": 1.0384571850299835, "epoch": 0.49572192513368984, "grad_norm": 0.20053574442863464, "learning_rate": 0.00011584413406449128, "loss": 1.0471, "mean_token_accuracy": 0.7360357940196991, "num_tokens": 30230029.0, "step": 927 }, { "entropy": 0.9797585010528564, "epoch": 0.49625668449197863, "grad_norm": 0.14603500068187714, "learning_rate": 0.00011568850120348306, "loss": 0.9685, "mean_token_accuracy": 0.7514859288930893, "num_tokens": 30262449.0, "step": 928 }, { "entropy": 0.9807269275188446, "epoch": 0.49679144385026736, "grad_norm": 0.1834476888179779, "learning_rate": 0.00011553285126192355, "loss": 0.9852, "mean_token_accuracy": 0.7513086497783661, "num_tokens": 30294775.0, "step": 929 }, { "entropy": 1.0138742625713348, "epoch": 0.49732620320855614, "grad_norm": 0.1444700062274933, "learning_rate": 0.00011537718470717438, "loss": 0.9956, "mean_token_accuracy": 0.7464201748371124, "num_tokens": 30327479.0, "step": 930 }, { "entropy": 0.9629910737276077, "epoch": 0.4978609625668449, "grad_norm": 0.18388797342777252, "learning_rate": 0.00011522150200664695, "loss": 0.9479, "mean_token_accuracy": 0.7568120658397675, "num_tokens": 30360247.0, "step": 931 }, { "entropy": 0.987065315246582, "epoch": 0.4983957219251337, "grad_norm": 0.14784592390060425, "learning_rate": 0.00011506580362780121, "loss": 0.9769, "mean_token_accuracy": 0.7503300607204437, "num_tokens": 30392876.0, "step": 932 }, { "entropy": 0.9994514882564545, "epoch": 0.49893048128342243, "grad_norm": 0.18403013050556183, "learning_rate": 0.00011491009003814417, "loss": 0.9884, "mean_token_accuracy": 0.7486559152603149, "num_tokens": 30425644.0, "step": 933 }, { "entropy": 0.9785473197698593, "epoch": 0.4994652406417112, "grad_norm": 0.163219153881073, "learning_rate": 0.00011475436170522842, "loss": 0.9865, "mean_token_accuracy": 0.7483198940753937, "num_tokens": 30458412.0, "step": 934 }, { "entropy": 0.9718093425035477, "epoch": 0.5, "grad_norm": 0.1547645777463913, "learning_rate": 0.00011459861909665106, "loss": 0.9693, "mean_token_accuracy": 0.7519071251153946, "num_tokens": 30491058.0, "step": 935 }, { "entropy": 0.9814210534095764, "epoch": 0.5005347593582887, "grad_norm": 0.1539863497018814, "learning_rate": 0.00011444286268005173, "loss": 0.9738, "mean_token_accuracy": 0.751496821641922, "num_tokens": 30523826.0, "step": 936 }, { "entropy": 0.924762949347496, "epoch": 0.5010695187165776, "grad_norm": 0.15415582060813904, "learning_rate": 0.00011428709292311179, "loss": 0.9331, "mean_token_accuracy": 0.7617912888526917, "num_tokens": 30556594.0, "step": 937 }, { "entropy": 0.9639741778373718, "epoch": 0.5016042780748663, "grad_norm": 0.16413013637065887, "learning_rate": 0.0001141313102935526, "loss": 0.9578, "mean_token_accuracy": 0.7549447417259216, "num_tokens": 30588411.0, "step": 938 }, { "entropy": 0.9839192181825638, "epoch": 0.5021390374331551, "grad_norm": 0.15187539160251617, "learning_rate": 0.00011397551525913403, "loss": 0.9943, "mean_token_accuracy": 0.7456000745296478, "num_tokens": 30621116.0, "step": 939 }, { "entropy": 0.9420085400342941, "epoch": 0.5026737967914439, "grad_norm": 0.1593165397644043, "learning_rate": 0.00011381970828765336, "loss": 0.9376, "mean_token_accuracy": 0.7606199234724045, "num_tokens": 30653222.0, "step": 940 }, { "entropy": 0.9681316465139389, "epoch": 0.5032085561497326, "grad_norm": 0.1515263170003891, "learning_rate": 0.00011366388984694365, "loss": 0.9952, "mean_token_accuracy": 0.7483191341161728, "num_tokens": 30685809.0, "step": 941 }, { "entropy": 0.9541009217500687, "epoch": 0.5037433155080214, "grad_norm": 0.15999698638916016, "learning_rate": 0.00011350806040487245, "loss": 0.9562, "mean_token_accuracy": 0.7580645084381104, "num_tokens": 30718577.0, "step": 942 }, { "entropy": 0.9750600904226303, "epoch": 0.5042780748663102, "grad_norm": 0.14765378832817078, "learning_rate": 0.00011335222042934022, "loss": 0.9757, "mean_token_accuracy": 0.7525048851966858, "num_tokens": 30751345.0, "step": 943 }, { "entropy": 0.9981716424226761, "epoch": 0.5048128342245989, "grad_norm": 0.15309534966945648, "learning_rate": 0.00011319637038827918, "loss": 0.9948, "mean_token_accuracy": 0.7471605241298676, "num_tokens": 30784097.0, "step": 944 }, { "entropy": 0.9929149597883224, "epoch": 0.5053475935828877, "grad_norm": 0.14857131242752075, "learning_rate": 0.0001130405107496517, "loss": 0.9993, "mean_token_accuracy": 0.7473356425762177, "num_tokens": 30816631.0, "step": 945 }, { "entropy": 0.9805185496807098, "epoch": 0.5058823529411764, "grad_norm": 0.15281623601913452, "learning_rate": 0.000112884641981449, "loss": 0.9783, "mean_token_accuracy": 0.7508858740329742, "num_tokens": 30849399.0, "step": 946 }, { "entropy": 1.0177381336688995, "epoch": 0.5064171122994653, "grad_norm": 0.14827761054039001, "learning_rate": 0.00011272876455168965, "loss": 1.0084, "mean_token_accuracy": 0.7458373308181763, "num_tokens": 30882166.0, "step": 947 }, { "entropy": 0.9999857991933823, "epoch": 0.506951871657754, "grad_norm": 0.14711898565292358, "learning_rate": 0.00011257287892841835, "loss": 0.9881, "mean_token_accuracy": 0.7506720423698425, "num_tokens": 30914934.0, "step": 948 }, { "entropy": 1.0082168877124786, "epoch": 0.5074866310160427, "grad_norm": 0.14489367604255676, "learning_rate": 0.00011241698557970428, "loss": 1.008, "mean_token_accuracy": 0.7478005886077881, "num_tokens": 30947702.0, "step": 949 }, { "entropy": 0.9544709473848343, "epoch": 0.5080213903743316, "grad_norm": 0.1495532989501953, "learning_rate": 0.0001122610849736399, "loss": 0.9539, "mean_token_accuracy": 0.7564558535814285, "num_tokens": 30980308.0, "step": 950 }, { "entropy": 0.9981829226016998, "epoch": 0.5085561497326203, "grad_norm": 0.15191273391246796, "learning_rate": 0.0001121051775783394, "loss": 1.0066, "mean_token_accuracy": 0.7489893287420273, "num_tokens": 31012771.0, "step": 951 }, { "entropy": 0.9858329743146896, "epoch": 0.509090909090909, "grad_norm": 0.15609891712665558, "learning_rate": 0.00011194926386193738, "loss": 0.9947, "mean_token_accuracy": 0.7483371645212173, "num_tokens": 31045494.0, "step": 952 }, { "entropy": 0.951867401599884, "epoch": 0.5096256684491979, "grad_norm": 0.150090754032135, "learning_rate": 0.00011179334429258747, "loss": 0.9614, "mean_token_accuracy": 0.7558956444263458, "num_tokens": 31078262.0, "step": 953 }, { "entropy": 0.9978301674127579, "epoch": 0.5101604278074866, "grad_norm": 0.14746372401714325, "learning_rate": 0.00011163741933846077, "loss": 0.9908, "mean_token_accuracy": 0.7496818602085114, "num_tokens": 31110967.0, "step": 954 }, { "entropy": 1.0230722427368164, "epoch": 0.5106951871657754, "grad_norm": 0.14543987810611725, "learning_rate": 0.00011148148946774465, "loss": 1.0243, "mean_token_accuracy": 0.7438846528530121, "num_tokens": 31143570.0, "step": 955 }, { "entropy": 0.9597311317920685, "epoch": 0.5112299465240642, "grad_norm": 0.1489652544260025, "learning_rate": 0.00011132555514864118, "loss": 0.9572, "mean_token_accuracy": 0.7564149498939514, "num_tokens": 31176338.0, "step": 956 }, { "entropy": 1.0460785180330276, "epoch": 0.5117647058823529, "grad_norm": 0.1502375602722168, "learning_rate": 0.00011116961684936582, "loss": 1.0382, "mean_token_accuracy": 0.7419738918542862, "num_tokens": 31208888.0, "step": 957 }, { "entropy": 1.0283286571502686, "epoch": 0.5122994652406417, "grad_norm": 0.15182124078273773, "learning_rate": 0.00011101367503814591, "loss": 1.0118, "mean_token_accuracy": 0.7439312040805817, "num_tokens": 31241523.0, "step": 958 }, { "entropy": 1.035444438457489, "epoch": 0.5128342245989305, "grad_norm": 0.14735203981399536, "learning_rate": 0.00011085773018321947, "loss": 1.0272, "mean_token_accuracy": 0.744809091091156, "num_tokens": 31273776.0, "step": 959 }, { "entropy": 0.9777446836233139, "epoch": 0.5133689839572193, "grad_norm": 0.16726234555244446, "learning_rate": 0.00011070178275283359, "loss": 0.97, "mean_token_accuracy": 0.7573237866163254, "num_tokens": 31306310.0, "step": 960 }, { "entropy": 0.9591176509857178, "epoch": 0.513903743315508, "grad_norm": 0.16735734045505524, "learning_rate": 0.00011054583321524298, "loss": 0.9704, "mean_token_accuracy": 0.7537226974964142, "num_tokens": 31338881.0, "step": 961 }, { "entropy": 0.9389115124940872, "epoch": 0.5144385026737968, "grad_norm": 0.1522175818681717, "learning_rate": 0.00011038988203870888, "loss": 0.9487, "mean_token_accuracy": 0.7594985961914062, "num_tokens": 31371418.0, "step": 962 }, { "entropy": 0.9493906497955322, "epoch": 0.5149732620320856, "grad_norm": 0.16043366491794586, "learning_rate": 0.0001102339296914973, "loss": 0.9398, "mean_token_accuracy": 0.7587670981884003, "num_tokens": 31404186.0, "step": 963 }, { "entropy": 1.0073987543582916, "epoch": 0.5155080213903743, "grad_norm": 0.15452541410923004, "learning_rate": 0.00011007797664187784, "loss": 0.9966, "mean_token_accuracy": 0.7497439980506897, "num_tokens": 31436823.0, "step": 964 }, { "entropy": 0.9574171453714371, "epoch": 0.516042780748663, "grad_norm": 0.14766879379749298, "learning_rate": 0.00010992202335812218, "loss": 0.9608, "mean_token_accuracy": 0.7573619186878204, "num_tokens": 31469591.0, "step": 965 }, { "entropy": 1.042914018034935, "epoch": 0.5165775401069519, "grad_norm": 0.1512092649936676, "learning_rate": 0.00010976607030850275, "loss": 1.034, "mean_token_accuracy": 0.7408221811056137, "num_tokens": 31502350.0, "step": 966 }, { "entropy": 0.9637761563062668, "epoch": 0.5171122994652406, "grad_norm": 0.15066123008728027, "learning_rate": 0.00010961011796129117, "loss": 0.9702, "mean_token_accuracy": 0.755041629076004, "num_tokens": 31535071.0, "step": 967 }, { "entropy": 1.0045630931854248, "epoch": 0.5176470588235295, "grad_norm": 0.14877843856811523, "learning_rate": 0.00010945416678475707, "loss": 1.0027, "mean_token_accuracy": 0.7474340200424194, "num_tokens": 31567839.0, "step": 968 }, { "entropy": 0.9725369513034821, "epoch": 0.5181818181818182, "grad_norm": 0.15289999544620514, "learning_rate": 0.00010929821724716645, "loss": 0.9667, "mean_token_accuracy": 0.7536260038614273, "num_tokens": 31600299.0, "step": 969 }, { "entropy": 0.9684153497219086, "epoch": 0.5187165775401069, "grad_norm": 0.14860281348228455, "learning_rate": 0.00010914226981678055, "loss": 0.9442, "mean_token_accuracy": 0.7581672668457031, "num_tokens": 31632916.0, "step": 970 }, { "entropy": 0.9741877317428589, "epoch": 0.5192513368983958, "grad_norm": 0.14804121851921082, "learning_rate": 0.00010898632496185413, "loss": 0.974, "mean_token_accuracy": 0.7507331371307373, "num_tokens": 31665684.0, "step": 971 }, { "entropy": 0.9431410431861877, "epoch": 0.5197860962566845, "grad_norm": 0.1512025147676468, "learning_rate": 0.00010883038315063421, "loss": 0.9523, "mean_token_accuracy": 0.7577895820140839, "num_tokens": 31698452.0, "step": 972 }, { "entropy": 0.9640598893165588, "epoch": 0.5203208556149732, "grad_norm": 0.158531054854393, "learning_rate": 0.00010867444485135884, "loss": 0.9787, "mean_token_accuracy": 0.7556378543376923, "num_tokens": 31731149.0, "step": 973 }, { "entropy": 0.9679831564426422, "epoch": 0.520855614973262, "grad_norm": 0.14988689124584198, "learning_rate": 0.0001085185105322554, "loss": 0.9634, "mean_token_accuracy": 0.7536378353834152, "num_tokens": 31763506.0, "step": 974 }, { "entropy": 0.9720947295427322, "epoch": 0.5213903743315508, "grad_norm": 0.14716538786888123, "learning_rate": 0.00010836258066153925, "loss": 0.974, "mean_token_accuracy": 0.7534502297639847, "num_tokens": 31796162.0, "step": 975 }, { "entropy": 1.0021846890449524, "epoch": 0.5219251336898396, "grad_norm": 0.14980396628379822, "learning_rate": 0.00010820665570741255, "loss": 1.0027, "mean_token_accuracy": 0.7444467395544052, "num_tokens": 31828756.0, "step": 976 }, { "entropy": 1.0210732370615005, "epoch": 0.5224598930481283, "grad_norm": 0.14784151315689087, "learning_rate": 0.00010805073613806263, "loss": 1.0369, "mean_token_accuracy": 0.7449291348457336, "num_tokens": 31861524.0, "step": 977 }, { "entropy": 1.0197188556194305, "epoch": 0.5229946524064171, "grad_norm": 0.14898277819156647, "learning_rate": 0.00010789482242166065, "loss": 1.0113, "mean_token_accuracy": 0.7475687712430954, "num_tokens": 31893923.0, "step": 978 }, { "entropy": 0.9855650812387466, "epoch": 0.5235294117647059, "grad_norm": 0.14652405679225922, "learning_rate": 0.00010773891502636013, "loss": 0.9902, "mean_token_accuracy": 0.7477983981370926, "num_tokens": 31926465.0, "step": 979 }, { "entropy": 1.0192323327064514, "epoch": 0.5240641711229946, "grad_norm": 0.15139682590961456, "learning_rate": 0.00010758301442029573, "loss": 1.0169, "mean_token_accuracy": 0.7415623664855957, "num_tokens": 31958894.0, "step": 980 }, { "entropy": 1.0423056483268738, "epoch": 0.5245989304812835, "grad_norm": 0.15091025829315186, "learning_rate": 0.00010742712107158169, "loss": 1.048, "mean_token_accuracy": 0.737475574016571, "num_tokens": 31991662.0, "step": 981 }, { "entropy": 1.0017990171909332, "epoch": 0.5251336898395722, "grad_norm": 0.14979922771453857, "learning_rate": 0.0001072712354483104, "loss": 0.9875, "mean_token_accuracy": 0.7491593360900879, "num_tokens": 32024339.0, "step": 982 }, { "entropy": 0.9572014361619949, "epoch": 0.5256684491978609, "grad_norm": 0.16085804998874664, "learning_rate": 0.00010711535801855104, "loss": 0.952, "mean_token_accuracy": 0.7552261650562286, "num_tokens": 32056882.0, "step": 983 }, { "entropy": 0.9995868802070618, "epoch": 0.5262032085561498, "grad_norm": 0.15602126717567444, "learning_rate": 0.00010695948925034833, "loss": 1.0072, "mean_token_accuracy": 0.7474450767040253, "num_tokens": 32089485.0, "step": 984 }, { "entropy": 1.0375087559223175, "epoch": 0.5267379679144385, "grad_norm": 0.14877036213874817, "learning_rate": 0.00010680362961172086, "loss": 1.0437, "mean_token_accuracy": 0.7392778694629669, "num_tokens": 32122253.0, "step": 985 }, { "entropy": 0.9701367020606995, "epoch": 0.5272727272727272, "grad_norm": 0.15291082859039307, "learning_rate": 0.00010664777957065981, "loss": 0.969, "mean_token_accuracy": 0.7528199255466461, "num_tokens": 32154959.0, "step": 986 }, { "entropy": 0.9755905866622925, "epoch": 0.5278074866310161, "grad_norm": 0.15403801202774048, "learning_rate": 0.00010649193959512758, "loss": 0.9582, "mean_token_accuracy": 0.7578248232603073, "num_tokens": 32187679.0, "step": 987 }, { "entropy": 0.9252783358097076, "epoch": 0.5283422459893048, "grad_norm": 0.15754017233848572, "learning_rate": 0.00010633611015305636, "loss": 0.9317, "mean_token_accuracy": 0.7646315395832062, "num_tokens": 32220116.0, "step": 988 }, { "entropy": 0.9623700231313705, "epoch": 0.5288770053475936, "grad_norm": 0.16240595281124115, "learning_rate": 0.00010618029171234669, "loss": 0.9608, "mean_token_accuracy": 0.7552236020565033, "num_tokens": 32252884.0, "step": 989 }, { "entropy": 0.9905337542295456, "epoch": 0.5294117647058824, "grad_norm": 0.19256940484046936, "learning_rate": 0.00010602448474086602, "loss": 0.99, "mean_token_accuracy": 0.7485212236642838, "num_tokens": 32285038.0, "step": 990 }, { "entropy": 1.048939511179924, "epoch": 0.5299465240641711, "grad_norm": 0.15117883682250977, "learning_rate": 0.00010586868970644745, "loss": 1.0577, "mean_token_accuracy": 0.7386181652545929, "num_tokens": 32317800.0, "step": 991 }, { "entropy": 0.953388124704361, "epoch": 0.5304812834224599, "grad_norm": 0.1687232106924057, "learning_rate": 0.00010571290707688823, "loss": 0.9412, "mean_token_accuracy": 0.7602576017379761, "num_tokens": 32349982.0, "step": 992 }, { "entropy": 0.9880169332027435, "epoch": 0.5310160427807487, "grad_norm": 0.15983946621418, "learning_rate": 0.00010555713731994829, "loss": 0.9676, "mean_token_accuracy": 0.7497250735759735, "num_tokens": 32382750.0, "step": 993 }, { "entropy": 1.0043831318616867, "epoch": 0.5315508021390374, "grad_norm": 0.14981651306152344, "learning_rate": 0.00010540138090334897, "loss": 1.0026, "mean_token_accuracy": 0.7486239373683929, "num_tokens": 32415099.0, "step": 994 }, { "entropy": 1.0175066590309143, "epoch": 0.5320855614973262, "grad_norm": 0.16312158107757568, "learning_rate": 0.00010524563829477157, "loss": 1.025, "mean_token_accuracy": 0.7404081225395203, "num_tokens": 32447867.0, "step": 995 }, { "entropy": 0.9783085435628891, "epoch": 0.532620320855615, "grad_norm": 0.1860574185848236, "learning_rate": 0.0001050899099618559, "loss": 0.9823, "mean_token_accuracy": 0.7511302530765533, "num_tokens": 32480635.0, "step": 996 }, { "entropy": 1.0055098831653595, "epoch": 0.5331550802139038, "grad_norm": 0.1509351283311844, "learning_rate": 0.00010493419637219881, "loss": 1.0153, "mean_token_accuracy": 0.7382027506828308, "num_tokens": 32513121.0, "step": 997 }, { "entropy": 0.9564041495323181, "epoch": 0.5336898395721925, "grad_norm": 0.19098837673664093, "learning_rate": 0.00010477849799335306, "loss": 0.9433, "mean_token_accuracy": 0.7583567798137665, "num_tokens": 32545807.0, "step": 998 }, { "entropy": 0.9641523063182831, "epoch": 0.5342245989304812, "grad_norm": 0.1565517634153366, "learning_rate": 0.00010462281529282567, "loss": 0.9567, "mean_token_accuracy": 0.7590420246124268, "num_tokens": 32578575.0, "step": 999 }, { "entropy": 0.9831237494945526, "epoch": 0.5347593582887701, "grad_norm": 0.1680804342031479, "learning_rate": 0.00010446714873807648, "loss": 0.974, "mean_token_accuracy": 0.7528853267431259, "num_tokens": 32610897.0, "step": 1000 }, { "entropy": 0.9834153801202774, "epoch": 0.5352941176470588, "grad_norm": 0.17859195172786713, "learning_rate": 0.00010431149879651699, "loss": 0.9688, "mean_token_accuracy": 0.7543960511684418, "num_tokens": 32643225.0, "step": 1001 }, { "entropy": 0.9904739707708359, "epoch": 0.5358288770053476, "grad_norm": 0.15401509404182434, "learning_rate": 0.00010415586593550874, "loss": 0.9977, "mean_token_accuracy": 0.752443790435791, "num_tokens": 32675993.0, "step": 1002 }, { "entropy": 1.036316692829132, "epoch": 0.5363636363636364, "grad_norm": 0.18634629249572754, "learning_rate": 0.00010400025062236205, "loss": 1.0428, "mean_token_accuracy": 0.7439382523298264, "num_tokens": 32708720.0, "step": 1003 }, { "entropy": 0.9291573613882065, "epoch": 0.5368983957219251, "grad_norm": 0.16239655017852783, "learning_rate": 0.00010384465332433444, "loss": 0.9462, "mean_token_accuracy": 0.7608358263969421, "num_tokens": 32741274.0, "step": 1004 }, { "entropy": 0.9835901707410812, "epoch": 0.5374331550802139, "grad_norm": 0.15718552470207214, "learning_rate": 0.00010368907450862944, "loss": 0.994, "mean_token_accuracy": 0.7442265450954437, "num_tokens": 32774042.0, "step": 1005 }, { "entropy": 0.9875447750091553, "epoch": 0.5379679144385027, "grad_norm": 0.14904296398162842, "learning_rate": 0.00010353351464239502, "loss": 0.9828, "mean_token_accuracy": 0.7503054738044739, "num_tokens": 32806810.0, "step": 1006 }, { "entropy": 0.9224865585565567, "epoch": 0.5385026737967914, "grad_norm": 0.1498638242483139, "learning_rate": 0.00010337797419272232, "loss": 0.9247, "mean_token_accuracy": 0.7628298997879028, "num_tokens": 32839578.0, "step": 1007 }, { "entropy": 0.9844998568296432, "epoch": 0.5390374331550802, "grad_norm": 0.14565370976924896, "learning_rate": 0.00010322245362664411, "loss": 0.9869, "mean_token_accuracy": 0.7503665685653687, "num_tokens": 32872346.0, "step": 1008 }, { "entropy": 0.9935092180967331, "epoch": 0.539572192513369, "grad_norm": 0.14800257980823517, "learning_rate": 0.00010306695341113348, "loss": 0.9729, "mean_token_accuracy": 0.7524667382240295, "num_tokens": 32904790.0, "step": 1009 }, { "entropy": 0.9884016215801239, "epoch": 0.5401069518716578, "grad_norm": 0.1749376505613327, "learning_rate": 0.00010291147401310244, "loss": 0.9811, "mean_token_accuracy": 0.7519508004188538, "num_tokens": 32937098.0, "step": 1010 }, { "entropy": 0.9943345189094543, "epoch": 0.5406417112299465, "grad_norm": 0.16141724586486816, "learning_rate": 0.0001027560158994004, "loss": 0.9946, "mean_token_accuracy": 0.7510824054479599, "num_tokens": 32969322.0, "step": 1011 }, { "entropy": 0.9604488164186478, "epoch": 0.5411764705882353, "grad_norm": 0.1509343534708023, "learning_rate": 0.00010260057953681299, "loss": 0.9533, "mean_token_accuracy": 0.7595346719026566, "num_tokens": 33002009.0, "step": 1012 }, { "entropy": 1.0147838592529297, "epoch": 0.5417112299465241, "grad_norm": 0.1797001212835312, "learning_rate": 0.00010244516539206048, "loss": 1.0128, "mean_token_accuracy": 0.7456928193569183, "num_tokens": 33034777.0, "step": 1013 }, { "entropy": 0.9897986203432083, "epoch": 0.5422459893048128, "grad_norm": 0.15886810421943665, "learning_rate": 0.00010228977393179641, "loss": 0.995, "mean_token_accuracy": 0.7470195144414902, "num_tokens": 33067285.0, "step": 1014 }, { "entropy": 0.9278879165649414, "epoch": 0.5427807486631016, "grad_norm": 0.15260817110538483, "learning_rate": 0.00010213440562260616, "loss": 0.9347, "mean_token_accuracy": 0.7599593251943588, "num_tokens": 33099583.0, "step": 1015 }, { "entropy": 0.9589514136314392, "epoch": 0.5433155080213904, "grad_norm": 0.16330361366271973, "learning_rate": 0.0001019790609310057, "loss": 0.9709, "mean_token_accuracy": 0.7548265010118484, "num_tokens": 33132351.0, "step": 1016 }, { "entropy": 0.9311648309230804, "epoch": 0.5438502673796791, "grad_norm": 0.15833255648612976, "learning_rate": 0.00010182374032344005, "loss": 0.9228, "mean_token_accuracy": 0.7587970197200775, "num_tokens": 33165070.0, "step": 1017 }, { "entropy": 0.9727670103311539, "epoch": 0.544385026737968, "grad_norm": 0.15479743480682373, "learning_rate": 0.00010166844426628186, "loss": 0.9725, "mean_token_accuracy": 0.7509775161743164, "num_tokens": 33197838.0, "step": 1018 }, { "entropy": 0.9768361747264862, "epoch": 0.5449197860962567, "grad_norm": 0.18230433762073517, "learning_rate": 0.0001015131732258301, "loss": 0.9838, "mean_token_accuracy": 0.7502443790435791, "num_tokens": 33230606.0, "step": 1019 }, { "entropy": 1.0158773809671402, "epoch": 0.5454545454545454, "grad_norm": 0.17845484614372253, "learning_rate": 0.00010135792766830865, "loss": 1.0109, "mean_token_accuracy": 0.7454035729169846, "num_tokens": 33263126.0, "step": 1020 }, { "entropy": 1.011165291070938, "epoch": 0.5459893048128343, "grad_norm": 0.15542946755886078, "learning_rate": 0.00010120270805986487, "loss": 1.0111, "mean_token_accuracy": 0.7446542084217072, "num_tokens": 33295894.0, "step": 1021 }, { "entropy": 0.9618840366601944, "epoch": 0.546524064171123, "grad_norm": 0.19388443231582642, "learning_rate": 0.00010104751486656809, "loss": 0.9531, "mean_token_accuracy": 0.7557734698057175, "num_tokens": 33328662.0, "step": 1022 }, { "entropy": 0.9803057909011841, "epoch": 0.5470588235294118, "grad_norm": 0.1536754071712494, "learning_rate": 0.00010089234855440852, "loss": 0.9603, "mean_token_accuracy": 0.7563233077526093, "num_tokens": 33361430.0, "step": 1023 }, { "entropy": 1.0168762803077698, "epoch": 0.5475935828877005, "grad_norm": 0.2355695366859436, "learning_rate": 0.00010073720958929548, "loss": 1.0217, "mean_token_accuracy": 0.7425296157598495, "num_tokens": 33393519.0, "step": 1024 }, { "entropy": 1.0100078284740448, "epoch": 0.5481283422459893, "grad_norm": 0.17015263438224792, "learning_rate": 0.00010058209843705632, "loss": 1.0212, "mean_token_accuracy": 0.7407746911048889, "num_tokens": 33426287.0, "step": 1025 }, { "entropy": 0.978960245847702, "epoch": 0.5486631016042781, "grad_norm": 0.16321249306201935, "learning_rate": 0.00010042701556343477, "loss": 0.9714, "mean_token_accuracy": 0.7537573277950287, "num_tokens": 33459055.0, "step": 1026 }, { "entropy": 1.0137168616056442, "epoch": 0.5491978609625668, "grad_norm": 0.16961227357387543, "learning_rate": 0.00010027196143408968, "loss": 1.0129, "mean_token_accuracy": 0.7448566257953644, "num_tokens": 33491781.0, "step": 1027 }, { "entropy": 0.9298945963382721, "epoch": 0.5497326203208556, "grad_norm": 0.14900726079940796, "learning_rate": 0.00010011693651459362, "loss": 0.9123, "mean_token_accuracy": 0.7624548673629761, "num_tokens": 33524362.0, "step": 1028 }, { "entropy": 0.9306894838809967, "epoch": 0.5502673796791444, "grad_norm": 0.16008970141410828, "learning_rate": 9.996194127043142e-05, "loss": 0.9353, "mean_token_accuracy": 0.759422555565834, "num_tokens": 33556506.0, "step": 1029 }, { "entropy": 0.973898783326149, "epoch": 0.5508021390374331, "grad_norm": 0.1580318957567215, "learning_rate": 9.980697616699878e-05, "loss": 0.9867, "mean_token_accuracy": 0.7452651560306549, "num_tokens": 33589274.0, "step": 1030 }, { "entropy": 0.9807162135839462, "epoch": 0.551336898395722, "grad_norm": 0.1588246375322342, "learning_rate": 9.9652041669601e-05, "loss": 0.972, "mean_token_accuracy": 0.7508553266525269, "num_tokens": 33622042.0, "step": 1031 }, { "entropy": 0.9518315494060516, "epoch": 0.5518716577540107, "grad_norm": 0.16469627618789673, "learning_rate": 9.949713824345142e-05, "loss": 0.9637, "mean_token_accuracy": 0.7573671042919159, "num_tokens": 33654776.0, "step": 1032 }, { "entropy": 1.003676250576973, "epoch": 0.5524064171122994, "grad_norm": 0.16015905141830444, "learning_rate": 9.934226635367001e-05, "loss": 1.0174, "mean_token_accuracy": 0.7440948337316513, "num_tokens": 33687412.0, "step": 1033 }, { "entropy": 0.9762967377901077, "epoch": 0.5529411764705883, "grad_norm": 0.16960027813911438, "learning_rate": 9.918742646528217e-05, "loss": 0.9947, "mean_token_accuracy": 0.7487170100212097, "num_tokens": 33720180.0, "step": 1034 }, { "entropy": 1.021599069237709, "epoch": 0.553475935828877, "grad_norm": 0.15136435627937317, "learning_rate": 9.903261904321713e-05, "loss": 1.0245, "mean_token_accuracy": 0.7440814822912216, "num_tokens": 33752595.0, "step": 1035 }, { "entropy": 0.9720664918422699, "epoch": 0.5540106951871657, "grad_norm": 0.15822596848011017, "learning_rate": 9.887784455230673e-05, "loss": 0.9758, "mean_token_accuracy": 0.7544098794460297, "num_tokens": 33785146.0, "step": 1036 }, { "entropy": 1.010760858654976, "epoch": 0.5545454545454546, "grad_norm": 0.15759292244911194, "learning_rate": 9.872310345728375e-05, "loss": 0.999, "mean_token_accuracy": 0.744012713432312, "num_tokens": 33817914.0, "step": 1037 }, { "entropy": 0.9856355041265488, "epoch": 0.5550802139037433, "grad_norm": 0.15385037660598755, "learning_rate": 9.856839622278091e-05, "loss": 0.9708, "mean_token_accuracy": 0.752427726984024, "num_tokens": 33850642.0, "step": 1038 }, { "entropy": 0.9753337502479553, "epoch": 0.5556149732620321, "grad_norm": 0.15805310010910034, "learning_rate": 9.841372331332915e-05, "loss": 0.9631, "mean_token_accuracy": 0.758213609457016, "num_tokens": 33883110.0, "step": 1039 }, { "entropy": 0.9520341902971268, "epoch": 0.5561497326203209, "grad_norm": 0.15785223245620728, "learning_rate": 9.825908519335624e-05, "loss": 0.9572, "mean_token_accuracy": 0.7571136206388474, "num_tokens": 33915493.0, "step": 1040 }, { "entropy": 1.0030668526887894, "epoch": 0.5566844919786096, "grad_norm": 0.18076303601264954, "learning_rate": 9.810448232718574e-05, "loss": 0.9865, "mean_token_accuracy": 0.7505433708429337, "num_tokens": 33948105.0, "step": 1041 }, { "entropy": 1.00014129281044, "epoch": 0.5572192513368984, "grad_norm": 0.15228135883808136, "learning_rate": 9.794991517903513e-05, "loss": 1.0084, "mean_token_accuracy": 0.7415904402732849, "num_tokens": 33980868.0, "step": 1042 }, { "entropy": 0.9635073244571686, "epoch": 0.5577540106951872, "grad_norm": 0.17229047417640686, "learning_rate": 9.779538421301478e-05, "loss": 0.9665, "mean_token_accuracy": 0.7538184225559235, "num_tokens": 34013636.0, "step": 1043 }, { "entropy": 0.9166641980409622, "epoch": 0.558288770053476, "grad_norm": 0.15340466797351837, "learning_rate": 9.76408898931263e-05, "loss": 0.9141, "mean_token_accuracy": 0.7659208625555038, "num_tokens": 34046231.0, "step": 1044 }, { "entropy": 0.991406574845314, "epoch": 0.5588235294117647, "grad_norm": 0.1854625940322876, "learning_rate": 9.748643268326138e-05, "loss": 0.9961, "mean_token_accuracy": 0.7477089464664459, "num_tokens": 34078999.0, "step": 1045 }, { "entropy": 0.9557927846908569, "epoch": 0.5593582887700534, "grad_norm": 0.14812085032463074, "learning_rate": 9.733201304720018e-05, "loss": 0.9597, "mean_token_accuracy": 0.753896564245224, "num_tokens": 34111761.0, "step": 1046 }, { "entropy": 0.9458355009555817, "epoch": 0.5598930481283423, "grad_norm": 0.21885894238948822, "learning_rate": 9.717763144861018e-05, "loss": 0.9483, "mean_token_accuracy": 0.7567524760961533, "num_tokens": 34144427.0, "step": 1047 }, { "entropy": 0.9695868790149689, "epoch": 0.560427807486631, "grad_norm": 0.174393892288208, "learning_rate": 9.702328835104451e-05, "loss": 0.9599, "mean_token_accuracy": 0.7538753300905228, "num_tokens": 34177075.0, "step": 1048 }, { "entropy": 0.9795443713665009, "epoch": 0.5609625668449197, "grad_norm": 0.19289985299110413, "learning_rate": 9.68689842179407e-05, "loss": 0.969, "mean_token_accuracy": 0.7516495585441589, "num_tokens": 34209843.0, "step": 1049 }, { "entropy": 0.9906342476606369, "epoch": 0.5614973262032086, "grad_norm": 0.18270674347877502, "learning_rate": 9.671471951261939e-05, "loss": 0.992, "mean_token_accuracy": 0.7464437782764435, "num_tokens": 34241903.0, "step": 1050 }, { "entropy": 0.9662675708532333, "epoch": 0.5620320855614973, "grad_norm": 0.19764238595962524, "learning_rate": 9.65604946982827e-05, "loss": 0.9758, "mean_token_accuracy": 0.7566316574811935, "num_tokens": 34274573.0, "step": 1051 }, { "entropy": 0.9911040216684341, "epoch": 0.5625668449197861, "grad_norm": 0.18982456624507904, "learning_rate": 9.640631023801312e-05, "loss": 0.9882, "mean_token_accuracy": 0.7486253678798676, "num_tokens": 34307341.0, "step": 1052 }, { "entropy": 0.9605313539505005, "epoch": 0.5631016042780749, "grad_norm": 0.1607426106929779, "learning_rate": 9.62521665947718e-05, "loss": 0.946, "mean_token_accuracy": 0.7591642141342163, "num_tokens": 34340109.0, "step": 1053 }, { "entropy": 1.0086674988269806, "epoch": 0.5636363636363636, "grad_norm": 0.16547992825508118, "learning_rate": 9.609806423139756e-05, "loss": 1.0119, "mean_token_accuracy": 0.7409510314464569, "num_tokens": 34372763.0, "step": 1054 }, { "entropy": 0.9168087691068649, "epoch": 0.5641711229946524, "grad_norm": 0.16736042499542236, "learning_rate": 9.594400361060501e-05, "loss": 0.9146, "mean_token_accuracy": 0.7686299085617065, "num_tokens": 34404772.0, "step": 1055 }, { "entropy": 1.0090791881084442, "epoch": 0.5647058823529412, "grad_norm": 0.17390309274196625, "learning_rate": 9.578998519498352e-05, "loss": 1.01, "mean_token_accuracy": 0.7473423779010773, "num_tokens": 34437540.0, "step": 1056 }, { "entropy": 0.9466338604688644, "epoch": 0.5652406417112299, "grad_norm": 0.15810315310955048, "learning_rate": 9.563600944699588e-05, "loss": 0.9524, "mean_token_accuracy": 0.7534824013710022, "num_tokens": 34470308.0, "step": 1057 }, { "entropy": 0.98748779296875, "epoch": 0.5657754010695187, "grad_norm": 0.167922243475914, "learning_rate": 9.548207682897656e-05, "loss": 0.9992, "mean_token_accuracy": 0.7505677789449692, "num_tokens": 34502945.0, "step": 1058 }, { "entropy": 0.9745074212551117, "epoch": 0.5663101604278075, "grad_norm": 0.16281603276729584, "learning_rate": 9.532818780313064e-05, "loss": 0.9697, "mean_token_accuracy": 0.7572364807128906, "num_tokens": 34535692.0, "step": 1059 }, { "entropy": 0.9525927603244781, "epoch": 0.5668449197860963, "grad_norm": 0.1510886698961258, "learning_rate": 9.517434283153224e-05, "loss": 0.9686, "mean_token_accuracy": 0.7546126544475555, "num_tokens": 34568460.0, "step": 1060 }, { "entropy": 0.9627717286348343, "epoch": 0.567379679144385, "grad_norm": 0.2195834219455719, "learning_rate": 9.502054237612332e-05, "loss": 0.9488, "mean_token_accuracy": 0.7562622129917145, "num_tokens": 34601228.0, "step": 1061 }, { "entropy": 0.9885230660438538, "epoch": 0.5679144385026738, "grad_norm": 0.16450177133083344, "learning_rate": 9.486678689871199e-05, "loss": 0.9812, "mean_token_accuracy": 0.7506720423698425, "num_tokens": 34633996.0, "step": 1062 }, { "entropy": 0.9659951478242874, "epoch": 0.5684491978609626, "grad_norm": 0.18741483986377716, "learning_rate": 9.471307686097142e-05, "loss": 0.9614, "mean_token_accuracy": 0.7537059783935547, "num_tokens": 34666458.0, "step": 1063 }, { "entropy": 1.0185732692480087, "epoch": 0.5689839572192513, "grad_norm": 0.16106286644935608, "learning_rate": 9.455941272443838e-05, "loss": 1.0255, "mean_token_accuracy": 0.7418881952762604, "num_tokens": 34698844.0, "step": 1064 }, { "entropy": 0.9865347892045975, "epoch": 0.56951871657754, "grad_norm": 0.1745121031999588, "learning_rate": 9.440579495051178e-05, "loss": 0.9701, "mean_token_accuracy": 0.7532685697078705, "num_tokens": 34731612.0, "step": 1065 }, { "entropy": 0.9505593776702881, "epoch": 0.5700534759358289, "grad_norm": 0.14933449029922485, "learning_rate": 9.425222400045124e-05, "loss": 0.9413, "mean_token_accuracy": 0.7585378736257553, "num_tokens": 34764209.0, "step": 1066 }, { "entropy": 0.996794581413269, "epoch": 0.5705882352941176, "grad_norm": 0.18804962933063507, "learning_rate": 9.409870033537584e-05, "loss": 0.9973, "mean_token_accuracy": 0.7489432394504547, "num_tokens": 34796870.0, "step": 1067 }, { "entropy": 0.9904070794582367, "epoch": 0.5711229946524065, "grad_norm": 0.1591782420873642, "learning_rate": 9.394522441626276e-05, "loss": 0.9844, "mean_token_accuracy": 0.7517068386077881, "num_tokens": 34829618.0, "step": 1068 }, { "entropy": 0.9387254118919373, "epoch": 0.5716577540106952, "grad_norm": 0.19616670906543732, "learning_rate": 9.379179670394566e-05, "loss": 0.9328, "mean_token_accuracy": 0.7617285847663879, "num_tokens": 34862260.0, "step": 1069 }, { "entropy": 0.9555445462465286, "epoch": 0.5721925133689839, "grad_norm": 0.1653156578540802, "learning_rate": 9.36384176591136e-05, "loss": 0.9705, "mean_token_accuracy": 0.7522734850645065, "num_tokens": 34895013.0, "step": 1070 }, { "entropy": 0.9247808158397675, "epoch": 0.5727272727272728, "grad_norm": 0.1534564346075058, "learning_rate": 9.348508774230942e-05, "loss": 0.9272, "mean_token_accuracy": 0.7621528804302216, "num_tokens": 34927693.0, "step": 1071 }, { "entropy": 0.9609176367521286, "epoch": 0.5732620320855615, "grad_norm": 0.18071511387825012, "learning_rate": 9.333180741392849e-05, "loss": 0.9683, "mean_token_accuracy": 0.7553821057081223, "num_tokens": 34959731.0, "step": 1072 }, { "entropy": 0.9573362022638321, "epoch": 0.5737967914438503, "grad_norm": 0.15922591090202332, "learning_rate": 9.317857713421723e-05, "loss": 0.9664, "mean_token_accuracy": 0.7546432018280029, "num_tokens": 34992499.0, "step": 1073 }, { "entropy": 0.9590311795473099, "epoch": 0.574331550802139, "grad_norm": 0.21154066920280457, "learning_rate": 9.30253973632718e-05, "loss": 0.9667, "mean_token_accuracy": 0.7547042965888977, "num_tokens": 35025267.0, "step": 1074 }, { "entropy": 0.9398636966943741, "epoch": 0.5748663101604278, "grad_norm": 0.15687710046768188, "learning_rate": 9.287226856103677e-05, "loss": 0.955, "mean_token_accuracy": 0.756537139415741, "num_tokens": 35058035.0, "step": 1075 }, { "entropy": 1.0047118067741394, "epoch": 0.5754010695187166, "grad_norm": 0.17438744008541107, "learning_rate": 9.271919118730358e-05, "loss": 1.0077, "mean_token_accuracy": 0.7462121248245239, "num_tokens": 35090803.0, "step": 1076 }, { "entropy": 0.9554581344127655, "epoch": 0.5759358288770053, "grad_norm": 0.17973192036151886, "learning_rate": 9.25661657017093e-05, "loss": 0.9418, "mean_token_accuracy": 0.7558650970458984, "num_tokens": 35123571.0, "step": 1077 }, { "entropy": 0.9667383879423141, "epoch": 0.5764705882352941, "grad_norm": 0.18665874004364014, "learning_rate": 9.241319256373518e-05, "loss": 0.938, "mean_token_accuracy": 0.7555632442235947, "num_tokens": 35156255.0, "step": 1078 }, { "entropy": 1.0221573561429977, "epoch": 0.5770053475935829, "grad_norm": 0.1763426810503006, "learning_rate": 9.226027223270528e-05, "loss": 1.0139, "mean_token_accuracy": 0.7463648617267609, "num_tokens": 35189023.0, "step": 1079 }, { "entropy": 1.031988486647606, "epoch": 0.5775401069518716, "grad_norm": 0.16598355770111084, "learning_rate": 9.210740516778511e-05, "loss": 1.0285, "mean_token_accuracy": 0.7438599765300751, "num_tokens": 35221791.0, "step": 1080 }, { "entropy": 0.9958008080720901, "epoch": 0.5780748663101605, "grad_norm": 0.16901369392871857, "learning_rate": 9.195459182798021e-05, "loss": 1.0063, "mean_token_accuracy": 0.7480481863021851, "num_tokens": 35254386.0, "step": 1081 }, { "entropy": 0.9488922357559204, "epoch": 0.5786096256684492, "grad_norm": 0.17280098795890808, "learning_rate": 9.180183267213487e-05, "loss": 0.9483, "mean_token_accuracy": 0.7611047327518463, "num_tokens": 35286675.0, "step": 1082 }, { "entropy": 0.9606172740459442, "epoch": 0.5791443850267379, "grad_norm": 0.17144496738910675, "learning_rate": 9.16491281589307e-05, "loss": 0.9474, "mean_token_accuracy": 0.7588587403297424, "num_tokens": 35319443.0, "step": 1083 }, { "entropy": 0.9573392570018768, "epoch": 0.5796791443850268, "grad_norm": 0.18120399117469788, "learning_rate": 9.149647874688508e-05, "loss": 0.9548, "mean_token_accuracy": 0.7587325572967529, "num_tokens": 35352193.0, "step": 1084 }, { "entropy": 1.0083431005477905, "epoch": 0.5802139037433155, "grad_norm": 0.16116108000278473, "learning_rate": 9.134388489435008e-05, "loss": 1.0083, "mean_token_accuracy": 0.7453295439481735, "num_tokens": 35384500.0, "step": 1085 }, { "entropy": 0.9728332459926605, "epoch": 0.5807486631016042, "grad_norm": 0.18752528727054596, "learning_rate": 9.119134705951096e-05, "loss": 0.983, "mean_token_accuracy": 0.7519244849681854, "num_tokens": 35417268.0, "step": 1086 }, { "entropy": 0.9972249269485474, "epoch": 0.5812834224598931, "grad_norm": 0.1591930240392685, "learning_rate": 9.103886570038467e-05, "loss": 1.0113, "mean_token_accuracy": 0.7478616833686829, "num_tokens": 35450036.0, "step": 1087 }, { "entropy": 0.9868172854185104, "epoch": 0.5818181818181818, "grad_norm": 0.18106745183467865, "learning_rate": 9.088644127481863e-05, "loss": 0.9828, "mean_token_accuracy": 0.7541590183973312, "num_tokens": 35482512.0, "step": 1088 }, { "entropy": 1.004557117819786, "epoch": 0.5823529411764706, "grad_norm": 0.15707138180732727, "learning_rate": 9.073407424048937e-05, "loss": 1.0098, "mean_token_accuracy": 0.7464142739772797, "num_tokens": 35514811.0, "step": 1089 }, { "entropy": 0.9354948103427887, "epoch": 0.5828877005347594, "grad_norm": 0.16020675003528595, "learning_rate": 9.058176505490106e-05, "loss": 0.9314, "mean_token_accuracy": 0.7615575194358826, "num_tokens": 35547361.0, "step": 1090 }, { "entropy": 0.9814960062503815, "epoch": 0.5834224598930481, "grad_norm": 0.15799638628959656, "learning_rate": 9.04295141753841e-05, "loss": 0.9946, "mean_token_accuracy": 0.7523322403430939, "num_tokens": 35580081.0, "step": 1091 }, { "entropy": 1.014263927936554, "epoch": 0.5839572192513369, "grad_norm": 0.17104533314704895, "learning_rate": 9.027732205909389e-05, "loss": 1.0043, "mean_token_accuracy": 0.7447763979434967, "num_tokens": 35612849.0, "step": 1092 }, { "entropy": 0.9643675088882446, "epoch": 0.5844919786096257, "grad_norm": 0.15428514778614044, "learning_rate": 9.012518916300939e-05, "loss": 0.939, "mean_token_accuracy": 0.7590114772319794, "num_tokens": 35645617.0, "step": 1093 }, { "entropy": 0.9955264329910278, "epoch": 0.5850267379679145, "grad_norm": 0.16135382652282715, "learning_rate": 8.997311594393172e-05, "loss": 0.9938, "mean_token_accuracy": 0.7491723150014877, "num_tokens": 35678042.0, "step": 1094 }, { "entropy": 0.9898623675107956, "epoch": 0.5855614973262032, "grad_norm": 0.1541261523962021, "learning_rate": 8.982110285848278e-05, "loss": 0.9866, "mean_token_accuracy": 0.7481366097927094, "num_tokens": 35710810.0, "step": 1095 }, { "entropy": 0.954077735543251, "epoch": 0.586096256684492, "grad_norm": 0.1534184366464615, "learning_rate": 8.966915036310396e-05, "loss": 0.9543, "mean_token_accuracy": 0.7587708383798599, "num_tokens": 35743567.0, "step": 1096 }, { "entropy": 0.956783264875412, "epoch": 0.5866310160427808, "grad_norm": 0.16506008803844452, "learning_rate": 8.951725891405476e-05, "loss": 0.9501, "mean_token_accuracy": 0.755631148815155, "num_tokens": 35776045.0, "step": 1097 }, { "entropy": 1.0246059000492096, "epoch": 0.5871657754010695, "grad_norm": 0.1616167426109314, "learning_rate": 8.936542896741123e-05, "loss": 1.0158, "mean_token_accuracy": 0.7454789876937866, "num_tokens": 35808813.0, "step": 1098 }, { "entropy": 0.9628048241138458, "epoch": 0.5877005347593582, "grad_norm": 0.16270913183689117, "learning_rate": 8.921366097906484e-05, "loss": 0.9509, "mean_token_accuracy": 0.7559598535299301, "num_tokens": 35841233.0, "step": 1099 }, { "entropy": 0.9961771219968796, "epoch": 0.5882352941176471, "grad_norm": 0.16457895934581757, "learning_rate": 8.906195540472108e-05, "loss": 0.972, "mean_token_accuracy": 0.7511082738637924, "num_tokens": 35873894.0, "step": 1100 }, { "entropy": 0.9758996218442917, "epoch": 0.5887700534759358, "grad_norm": 0.17612816393375397, "learning_rate": 8.8910312699898e-05, "loss": 0.9898, "mean_token_accuracy": 0.7528423368930817, "num_tokens": 35906576.0, "step": 1101 }, { "entropy": 0.9655701667070389, "epoch": 0.5893048128342246, "grad_norm": 0.15732590854167938, "learning_rate": 8.875873331992477e-05, "loss": 0.9654, "mean_token_accuracy": 0.7554679811000824, "num_tokens": 35939344.0, "step": 1102 }, { "entropy": 0.9655955731868744, "epoch": 0.5898395721925134, "grad_norm": 0.19670598208904266, "learning_rate": 8.860721771994053e-05, "loss": 0.9602, "mean_token_accuracy": 0.7536127120256424, "num_tokens": 35971865.0, "step": 1103 }, { "entropy": 1.0124208182096481, "epoch": 0.5903743315508021, "grad_norm": 0.1736741065979004, "learning_rate": 8.845576635489292e-05, "loss": 1.0143, "mean_token_accuracy": 0.7426514476537704, "num_tokens": 36004347.0, "step": 1104 }, { "entropy": 0.9384248703718185, "epoch": 0.5909090909090909, "grad_norm": 0.17268306016921997, "learning_rate": 8.830437967953664e-05, "loss": 0.945, "mean_token_accuracy": 0.7619134783744812, "num_tokens": 36037115.0, "step": 1105 }, { "entropy": 0.9208074063062668, "epoch": 0.5914438502673797, "grad_norm": 0.20721158385276794, "learning_rate": 8.815305814843214e-05, "loss": 0.9171, "mean_token_accuracy": 0.7619870901107788, "num_tokens": 36069843.0, "step": 1106 }, { "entropy": 0.9940818250179291, "epoch": 0.5919786096256684, "grad_norm": 0.18215195834636688, "learning_rate": 8.80018022159443e-05, "loss": 1.008, "mean_token_accuracy": 0.7472860366106033, "num_tokens": 36102079.0, "step": 1107 }, { "entropy": 0.9343003779649734, "epoch": 0.5925133689839572, "grad_norm": 0.20239266753196716, "learning_rate": 8.785061233624113e-05, "loss": 0.9514, "mean_token_accuracy": 0.7605693936347961, "num_tokens": 36134847.0, "step": 1108 }, { "entropy": 0.9898380935192108, "epoch": 0.593048128342246, "grad_norm": 0.15859541296958923, "learning_rate": 8.76994889632921e-05, "loss": 0.9993, "mean_token_accuracy": 0.7506109476089478, "num_tokens": 36167615.0, "step": 1109 }, { "entropy": 1.002129703760147, "epoch": 0.5935828877005348, "grad_norm": 0.1886453926563263, "learning_rate": 8.754843255086711e-05, "loss": 1.0126, "mean_token_accuracy": 0.7439516186714172, "num_tokens": 36200383.0, "step": 1110 }, { "entropy": 0.9945786744356155, "epoch": 0.5941176470588235, "grad_norm": 0.15921494364738464, "learning_rate": 8.739744355253501e-05, "loss": 0.9889, "mean_token_accuracy": 0.7528126686811447, "num_tokens": 36232975.0, "step": 1111 }, { "entropy": 0.9854760468006134, "epoch": 0.5946524064171123, "grad_norm": 0.16590659320354462, "learning_rate": 8.724652242166223e-05, "loss": 0.9882, "mean_token_accuracy": 0.7484716773033142, "num_tokens": 36265471.0, "step": 1112 }, { "entropy": 0.9827838242053986, "epoch": 0.5951871657754011, "grad_norm": 0.1649196296930313, "learning_rate": 8.709566961141129e-05, "loss": 0.9743, "mean_token_accuracy": 0.751588299870491, "num_tokens": 36298105.0, "step": 1113 }, { "entropy": 0.9589742720127106, "epoch": 0.5957219251336898, "grad_norm": 0.15379104018211365, "learning_rate": 8.694488557473976e-05, "loss": 0.9509, "mean_token_accuracy": 0.7559805363416672, "num_tokens": 36330617.0, "step": 1114 }, { "entropy": 0.9678229987621307, "epoch": 0.5962566844919787, "grad_norm": 0.1546468585729599, "learning_rate": 8.679417076439862e-05, "loss": 0.9535, "mean_token_accuracy": 0.7550341635942459, "num_tokens": 36362990.0, "step": 1115 }, { "entropy": 1.0166302472352982, "epoch": 0.5967914438502674, "grad_norm": 0.16416624188423157, "learning_rate": 8.66435256329309e-05, "loss": 1.0267, "mean_token_accuracy": 0.7411920875310898, "num_tokens": 36395335.0, "step": 1116 }, { "entropy": 0.9371855705976486, "epoch": 0.5973262032085561, "grad_norm": 0.15952086448669434, "learning_rate": 8.649295063267056e-05, "loss": 0.9313, "mean_token_accuracy": 0.7642102837562561, "num_tokens": 36427288.0, "step": 1117 }, { "entropy": 0.994782492518425, "epoch": 0.597860962566845, "grad_norm": 0.15828780829906464, "learning_rate": 8.634244621574088e-05, "loss": 0.9903, "mean_token_accuracy": 0.7489925920963287, "num_tokens": 36459884.0, "step": 1118 }, { "entropy": 0.9942251890897751, "epoch": 0.5983957219251337, "grad_norm": 0.16231967508792877, "learning_rate": 8.619201283405327e-05, "loss": 0.987, "mean_token_accuracy": 0.7529947757720947, "num_tokens": 36492227.0, "step": 1119 }, { "entropy": 0.979880079627037, "epoch": 0.5989304812834224, "grad_norm": 0.15880082547664642, "learning_rate": 8.604165093930578e-05, "loss": 0.9734, "mean_token_accuracy": 0.7522595971822739, "num_tokens": 36524228.0, "step": 1120 }, { "entropy": 0.9602563828229904, "epoch": 0.5994652406417113, "grad_norm": 0.1610957682132721, "learning_rate": 8.589136098298185e-05, "loss": 0.9491, "mean_token_accuracy": 0.7566593289375305, "num_tokens": 36556996.0, "step": 1121 }, { "entropy": 0.9304652363061905, "epoch": 0.6, "grad_norm": 0.16996730864048004, "learning_rate": 8.574114341634891e-05, "loss": 0.9285, "mean_token_accuracy": 0.7644611895084381, "num_tokens": 36589742.0, "step": 1122 }, { "entropy": 0.9693291038274765, "epoch": 0.6005347593582888, "grad_norm": 0.15727782249450684, "learning_rate": 8.559099869045706e-05, "loss": 0.9728, "mean_token_accuracy": 0.7531582415103912, "num_tokens": 36622299.0, "step": 1123 }, { "entropy": 0.9516191184520721, "epoch": 0.6010695187165775, "grad_norm": 0.1869746893644333, "learning_rate": 8.544092725613759e-05, "loss": 0.9525, "mean_token_accuracy": 0.7592806667089462, "num_tokens": 36654933.0, "step": 1124 }, { "entropy": 0.9548157751560211, "epoch": 0.6016042780748663, "grad_norm": 0.1933574229478836, "learning_rate": 8.52909295640018e-05, "loss": 0.9587, "mean_token_accuracy": 0.75409334897995, "num_tokens": 36687701.0, "step": 1125 }, { "entropy": 0.9195386320352554, "epoch": 0.6021390374331551, "grad_norm": 0.16644708812236786, "learning_rate": 8.51410060644396e-05, "loss": 0.9236, "mean_token_accuracy": 0.7597140669822693, "num_tokens": 36720469.0, "step": 1126 }, { "entropy": 0.9431671649217606, "epoch": 0.6026737967914438, "grad_norm": 0.1637318879365921, "learning_rate": 8.499115720761802e-05, "loss": 0.9534, "mean_token_accuracy": 0.758542999625206, "num_tokens": 36752930.0, "step": 1127 }, { "entropy": 0.9756807684898376, "epoch": 0.6032085561497326, "grad_norm": 0.15840820968151093, "learning_rate": 8.484138344348002e-05, "loss": 0.9918, "mean_token_accuracy": 0.7510746717453003, "num_tokens": 36785566.0, "step": 1128 }, { "entropy": 0.9955893605947495, "epoch": 0.6037433155080214, "grad_norm": 0.16000495851039886, "learning_rate": 8.469168522174315e-05, "loss": 0.9939, "mean_token_accuracy": 0.7508969306945801, "num_tokens": 36817123.0, "step": 1129 }, { "entropy": 0.9839041531085968, "epoch": 0.6042780748663101, "grad_norm": 0.16458529233932495, "learning_rate": 8.454206299189804e-05, "loss": 0.9871, "mean_token_accuracy": 0.7512162774801254, "num_tokens": 36849688.0, "step": 1130 }, { "entropy": 0.9587539881467819, "epoch": 0.604812834224599, "grad_norm": 0.1586272120475769, "learning_rate": 8.439251720320709e-05, "loss": 0.9582, "mean_token_accuracy": 0.7575050890445709, "num_tokens": 36882114.0, "step": 1131 }, { "entropy": 0.9999642670154572, "epoch": 0.6053475935828877, "grad_norm": 0.18612682819366455, "learning_rate": 8.424304830470335e-05, "loss": 0.9911, "mean_token_accuracy": 0.7502443790435791, "num_tokens": 36914882.0, "step": 1132 }, { "entropy": 0.9600541740655899, "epoch": 0.6058823529411764, "grad_norm": 0.16042885184288025, "learning_rate": 8.409365674518888e-05, "loss": 0.9422, "mean_token_accuracy": 0.7583836913108826, "num_tokens": 36947505.0, "step": 1133 }, { "entropy": 1.0059276521205902, "epoch": 0.6064171122994653, "grad_norm": 0.17258723080158234, "learning_rate": 8.394434297323355e-05, "loss": 1.0066, "mean_token_accuracy": 0.7461808770895004, "num_tokens": 36979947.0, "step": 1134 }, { "entropy": 0.9567270874977112, "epoch": 0.606951871657754, "grad_norm": 0.17004330456256866, "learning_rate": 8.37951074371736e-05, "loss": 0.9575, "mean_token_accuracy": 0.7579072117805481, "num_tokens": 37012622.0, "step": 1135 }, { "entropy": 1.0197716355323792, "epoch": 0.6074866310160428, "grad_norm": 0.16168364882469177, "learning_rate": 8.36459505851105e-05, "loss": 1.0093, "mean_token_accuracy": 0.7469758093357086, "num_tokens": 37045390.0, "step": 1136 }, { "entropy": 1.0205204039812088, "epoch": 0.6080213903743316, "grad_norm": 0.16087409853935242, "learning_rate": 8.349687286490926e-05, "loss": 1.013, "mean_token_accuracy": 0.7450649440288544, "num_tokens": 37078051.0, "step": 1137 }, { "entropy": 0.9923149198293686, "epoch": 0.6085561497326203, "grad_norm": 0.15231212973594666, "learning_rate": 8.334787472419745e-05, "loss": 0.9837, "mean_token_accuracy": 0.7486864626407623, "num_tokens": 37110819.0, "step": 1138 }, { "entropy": 0.9868474304676056, "epoch": 0.6090909090909091, "grad_norm": 0.16185684502124786, "learning_rate": 8.319895661036361e-05, "loss": 0.994, "mean_token_accuracy": 0.7506148368120193, "num_tokens": 37143572.0, "step": 1139 }, { "entropy": 0.9274433553218842, "epoch": 0.6096256684491979, "grad_norm": 0.16928084194660187, "learning_rate": 8.305011897055606e-05, "loss": 0.9355, "mean_token_accuracy": 0.759330689907074, "num_tokens": 37175360.0, "step": 1140 }, { "entropy": 0.9518332779407501, "epoch": 0.6101604278074866, "grad_norm": 0.16280804574489594, "learning_rate": 8.29013622516814e-05, "loss": 0.9511, "mean_token_accuracy": 0.75792196393013, "num_tokens": 37207756.0, "step": 1141 }, { "entropy": 0.9957929700613022, "epoch": 0.6106951871657754, "grad_norm": 0.16850176453590393, "learning_rate": 8.275268690040325e-05, "loss": 0.99, "mean_token_accuracy": 0.7500610947608948, "num_tokens": 37240524.0, "step": 1142 }, { "entropy": 0.9780972301959991, "epoch": 0.6112299465240641, "grad_norm": 0.1570751816034317, "learning_rate": 8.260409336314101e-05, "loss": 0.9725, "mean_token_accuracy": 0.7514051795005798, "num_tokens": 37273292.0, "step": 1143 }, { "entropy": 0.9441059827804565, "epoch": 0.611764705882353, "grad_norm": 0.16775594651699066, "learning_rate": 8.245558208606836e-05, "loss": 0.9442, "mean_token_accuracy": 0.7611830681562424, "num_tokens": 37305117.0, "step": 1144 }, { "entropy": 0.9759251922369003, "epoch": 0.6122994652406417, "grad_norm": 0.16318322718143463, "learning_rate": 8.230715351511193e-05, "loss": 0.9728, "mean_token_accuracy": 0.7533602118492126, "num_tokens": 37337885.0, "step": 1145 }, { "entropy": 1.004443645477295, "epoch": 0.6128342245989304, "grad_norm": 0.16552051901817322, "learning_rate": 8.215880809595014e-05, "loss": 1.0108, "mean_token_accuracy": 0.7459371984004974, "num_tokens": 37370653.0, "step": 1146 }, { "entropy": 0.9759458899497986, "epoch": 0.6133689839572193, "grad_norm": 0.1621161550283432, "learning_rate": 8.201054627401159e-05, "loss": 0.9805, "mean_token_accuracy": 0.7463954091072083, "num_tokens": 37403421.0, "step": 1147 }, { "entropy": 0.9828044027090073, "epoch": 0.613903743315508, "grad_norm": 0.1711197793483734, "learning_rate": 8.186236849447401e-05, "loss": 0.9985, "mean_token_accuracy": 0.749053031206131, "num_tokens": 37436189.0, "step": 1148 }, { "entropy": 0.9460033178329468, "epoch": 0.6144385026737967, "grad_norm": 0.1649860441684723, "learning_rate": 8.171427520226267e-05, "loss": 0.9542, "mean_token_accuracy": 0.7548500895500183, "num_tokens": 37468471.0, "step": 1149 }, { "entropy": 0.9906985908746719, "epoch": 0.6149732620320856, "grad_norm": 0.16224952042102814, "learning_rate": 8.156626684204916e-05, "loss": 1.0017, "mean_token_accuracy": 0.7461031377315521, "num_tokens": 37501131.0, "step": 1150 }, { "entropy": 0.9174692332744598, "epoch": 0.6155080213903743, "grad_norm": 0.17507107555866241, "learning_rate": 8.141834385825014e-05, "loss": 0.921, "mean_token_accuracy": 0.765705406665802, "num_tokens": 37533434.0, "step": 1151 }, { "entropy": 0.9863606989383698, "epoch": 0.6160427807486631, "grad_norm": 0.1530178338289261, "learning_rate": 8.127050669502588e-05, "loss": 0.9774, "mean_token_accuracy": 0.7490722090005875, "num_tokens": 37566153.0, "step": 1152 }, { "entropy": 0.9695703238248825, "epoch": 0.6165775401069519, "grad_norm": 0.1572701632976532, "learning_rate": 8.112275579627883e-05, "loss": 0.9645, "mean_token_accuracy": 0.7534518539905548, "num_tokens": 37598921.0, "step": 1153 }, { "entropy": 0.9552498608827591, "epoch": 0.6171122994652406, "grad_norm": 0.1595485359430313, "learning_rate": 8.097509160565265e-05, "loss": 0.9425, "mean_token_accuracy": 0.7607965171337128, "num_tokens": 37631010.0, "step": 1154 }, { "entropy": 0.9826492518186569, "epoch": 0.6176470588235294, "grad_norm": 0.16088756918907166, "learning_rate": 8.082751456653046e-05, "loss": 0.9722, "mean_token_accuracy": 0.7502389848232269, "num_tokens": 37663680.0, "step": 1155 }, { "entropy": 0.9433806240558624, "epoch": 0.6181818181818182, "grad_norm": 0.1699216216802597, "learning_rate": 8.068002512203375e-05, "loss": 0.9358, "mean_token_accuracy": 0.7620356678962708, "num_tokens": 37696448.0, "step": 1156 }, { "entropy": 0.9715216606855392, "epoch": 0.618716577540107, "grad_norm": 0.18060708045959473, "learning_rate": 8.053262371502102e-05, "loss": 0.9635, "mean_token_accuracy": 0.7556838393211365, "num_tokens": 37729169.0, "step": 1157 }, { "entropy": 1.0064241141080856, "epoch": 0.6192513368983957, "grad_norm": 0.16485314071178436, "learning_rate": 8.03853107880864e-05, "loss": 1.0032, "mean_token_accuracy": 0.749094694852829, "num_tokens": 37761445.0, "step": 1158 }, { "entropy": 0.9281845837831497, "epoch": 0.6197860962566845, "grad_norm": 0.16989584267139435, "learning_rate": 8.023808678355842e-05, "loss": 0.9421, "mean_token_accuracy": 0.7602249681949615, "num_tokens": 37793148.0, "step": 1159 }, { "entropy": 0.9493147134780884, "epoch": 0.6203208556149733, "grad_norm": 0.15808895230293274, "learning_rate": 8.009095214349842e-05, "loss": 0.951, "mean_token_accuracy": 0.7550097703933716, "num_tokens": 37825916.0, "step": 1160 }, { "entropy": 0.9858775585889816, "epoch": 0.620855614973262, "grad_norm": 0.1600574553012848, "learning_rate": 7.994390730969962e-05, "loss": 0.993, "mean_token_accuracy": 0.747996062040329, "num_tokens": 37858567.0, "step": 1161 }, { "entropy": 0.9468472003936768, "epoch": 0.6213903743315508, "grad_norm": 0.18879085779190063, "learning_rate": 7.979695272368549e-05, "loss": 0.9594, "mean_token_accuracy": 0.7577645629644394, "num_tokens": 37890859.0, "step": 1162 }, { "entropy": 0.9671633839607239, "epoch": 0.6219251336898396, "grad_norm": 0.1572384238243103, "learning_rate": 7.96500888267085e-05, "loss": 0.9652, "mean_token_accuracy": 0.7520887106657028, "num_tokens": 37923190.0, "step": 1163 }, { "entropy": 0.9662671089172363, "epoch": 0.6224598930481283, "grad_norm": 0.17573218047618866, "learning_rate": 7.950331605974884e-05, "loss": 0.9652, "mean_token_accuracy": 0.7542543113231659, "num_tokens": 37955830.0, "step": 1164 }, { "entropy": 0.9456570893526077, "epoch": 0.6229946524064172, "grad_norm": 0.15757232904434204, "learning_rate": 7.935663486351306e-05, "loss": 0.9404, "mean_token_accuracy": 0.7586449086666107, "num_tokens": 37988598.0, "step": 1165 }, { "entropy": 0.9794042408466339, "epoch": 0.6235294117647059, "grad_norm": 0.18899321556091309, "learning_rate": 7.921004567843278e-05, "loss": 0.969, "mean_token_accuracy": 0.7550794333219528, "num_tokens": 38020828.0, "step": 1166 }, { "entropy": 0.9964238554239273, "epoch": 0.6240641711229946, "grad_norm": 0.16320250928401947, "learning_rate": 7.906354894466328e-05, "loss": 0.9957, "mean_token_accuracy": 0.7484726309776306, "num_tokens": 38053596.0, "step": 1167 }, { "entropy": 0.9406930059194565, "epoch": 0.6245989304812835, "grad_norm": 0.16028763353824615, "learning_rate": 7.891714510208232e-05, "loss": 0.9464, "mean_token_accuracy": 0.7546432018280029, "num_tokens": 38086364.0, "step": 1168 }, { "entropy": 0.9705571532249451, "epoch": 0.6251336898395722, "grad_norm": 0.1809701919555664, "learning_rate": 7.877083459028872e-05, "loss": 0.9651, "mean_token_accuracy": 0.7501221895217896, "num_tokens": 38119132.0, "step": 1169 }, { "entropy": 0.9514707177877426, "epoch": 0.6256684491978609, "grad_norm": 0.17112714052200317, "learning_rate": 7.862461784860103e-05, "loss": 0.9769, "mean_token_accuracy": 0.7516950070858002, "num_tokens": 38151834.0, "step": 1170 }, { "entropy": 0.9061668813228607, "epoch": 0.6262032085561497, "grad_norm": 0.16375577449798584, "learning_rate": 7.847849531605624e-05, "loss": 0.9032, "mean_token_accuracy": 0.7696725130081177, "num_tokens": 38184602.0, "step": 1171 }, { "entropy": 0.9816283136606216, "epoch": 0.6267379679144385, "grad_norm": 0.2077077478170395, "learning_rate": 7.833246743140852e-05, "loss": 0.9645, "mean_token_accuracy": 0.7513135373592377, "num_tokens": 38217370.0, "step": 1172 }, { "entropy": 0.9717194885015488, "epoch": 0.6272727272727273, "grad_norm": 0.15337511897087097, "learning_rate": 7.81865346331278e-05, "loss": 0.9532, "mean_token_accuracy": 0.754337728023529, "num_tokens": 38250138.0, "step": 1173 }, { "entropy": 0.9285975843667984, "epoch": 0.627807486631016, "grad_norm": 0.17144861817359924, "learning_rate": 7.804069735939844e-05, "loss": 0.9331, "mean_token_accuracy": 0.7599584609270096, "num_tokens": 38282906.0, "step": 1174 }, { "entropy": 0.9945204854011536, "epoch": 0.6283422459893048, "grad_norm": 0.15670542418956757, "learning_rate": 7.78949560481181e-05, "loss": 0.9866, "mean_token_accuracy": 0.7500029057264328, "num_tokens": 38315357.0, "step": 1175 }, { "entropy": 0.9691455066204071, "epoch": 0.6288770053475936, "grad_norm": 0.17279230058193207, "learning_rate": 7.774931113689625e-05, "loss": 0.9767, "mean_token_accuracy": 0.7498340159654617, "num_tokens": 38347759.0, "step": 1176 }, { "entropy": 0.9547702074050903, "epoch": 0.6294117647058823, "grad_norm": 0.21260668337345123, "learning_rate": 7.76037630630529e-05, "loss": 0.9632, "mean_token_accuracy": 0.7575146555900574, "num_tokens": 38380527.0, "step": 1177 }, { "entropy": 0.948182076215744, "epoch": 0.6299465240641712, "grad_norm": 0.15867938101291656, "learning_rate": 7.745831226361724e-05, "loss": 0.9326, "mean_token_accuracy": 0.7615774571895599, "num_tokens": 38413295.0, "step": 1178 }, { "entropy": 0.9786037802696228, "epoch": 0.6304812834224599, "grad_norm": 0.18239183723926544, "learning_rate": 7.731295917532647e-05, "loss": 0.9667, "mean_token_accuracy": 0.7576979398727417, "num_tokens": 38446063.0, "step": 1179 }, { "entropy": 0.928612694144249, "epoch": 0.6310160427807486, "grad_norm": 0.17347387969493866, "learning_rate": 7.716770423462427e-05, "loss": 0.9233, "mean_token_accuracy": 0.7647849321365356, "num_tokens": 38478831.0, "step": 1180 }, { "entropy": 0.9102021008729935, "epoch": 0.6315508021390375, "grad_norm": 0.18702252209186554, "learning_rate": 7.702254787765983e-05, "loss": 0.9119, "mean_token_accuracy": 0.7655161917209625, "num_tokens": 38511575.0, "step": 1181 }, { "entropy": 0.9744768887758255, "epoch": 0.6320855614973262, "grad_norm": 0.17710831761360168, "learning_rate": 7.68774905402861e-05, "loss": 0.9636, "mean_token_accuracy": 0.7528095990419388, "num_tokens": 38544339.0, "step": 1182 }, { "entropy": 0.9430717974901199, "epoch": 0.6326203208556149, "grad_norm": 0.16024550795555115, "learning_rate": 7.673253265805887e-05, "loss": 0.9537, "mean_token_accuracy": 0.7593987137079239, "num_tokens": 38576778.0, "step": 1183 }, { "entropy": 0.9504353106021881, "epoch": 0.6331550802139038, "grad_norm": 0.18097208440303802, "learning_rate": 7.65876746662352e-05, "loss": 0.9438, "mean_token_accuracy": 0.758845865726471, "num_tokens": 38609380.0, "step": 1184 }, { "entropy": 0.9205587655305862, "epoch": 0.6336898395721925, "grad_norm": 0.16049431264400482, "learning_rate": 7.644291699977224e-05, "loss": 0.9256, "mean_token_accuracy": 0.7635595947504044, "num_tokens": 38642042.0, "step": 1185 }, { "entropy": 0.9848443865776062, "epoch": 0.6342245989304813, "grad_norm": 0.16670531034469604, "learning_rate": 7.629826009332597e-05, "loss": 0.9947, "mean_token_accuracy": 0.7503360211849213, "num_tokens": 38674810.0, "step": 1186 }, { "entropy": 0.9391316771507263, "epoch": 0.6347593582887701, "grad_norm": 0.17103585600852966, "learning_rate": 7.615370438124969e-05, "loss": 0.9416, "mean_token_accuracy": 0.7613780200481415, "num_tokens": 38707470.0, "step": 1187 }, { "entropy": 0.9331240952014923, "epoch": 0.6352941176470588, "grad_norm": 0.16136282682418823, "learning_rate": 7.600925029759303e-05, "loss": 0.9263, "mean_token_accuracy": 0.7609174847602844, "num_tokens": 38739709.0, "step": 1188 }, { "entropy": 0.9672888666391373, "epoch": 0.6358288770053476, "grad_norm": 0.16377750039100647, "learning_rate": 7.586489827610024e-05, "loss": 0.9877, "mean_token_accuracy": 0.7512218952178955, "num_tokens": 38772477.0, "step": 1189 }, { "entropy": 0.9343033581972122, "epoch": 0.6363636363636364, "grad_norm": 0.16749276220798492, "learning_rate": 7.572064875020934e-05, "loss": 0.9386, "mean_token_accuracy": 0.7590665519237518, "num_tokens": 38804492.0, "step": 1190 }, { "entropy": 0.956359788775444, "epoch": 0.6368983957219251, "grad_norm": 0.162041574716568, "learning_rate": 7.55765021530504e-05, "loss": 0.9656, "mean_token_accuracy": 0.7557953298091888, "num_tokens": 38837123.0, "step": 1191 }, { "entropy": 0.9705105721950531, "epoch": 0.6374331550802139, "grad_norm": 0.1744755655527115, "learning_rate": 7.543245891744463e-05, "loss": 0.9673, "mean_token_accuracy": 0.7517472356557846, "num_tokens": 38869764.0, "step": 1192 }, { "entropy": 0.9938700348138809, "epoch": 0.6379679144385026, "grad_norm": 0.15463124215602875, "learning_rate": 7.528851947590267e-05, "loss": 0.9856, "mean_token_accuracy": 0.7493914514780045, "num_tokens": 38902435.0, "step": 1193 }, { "entropy": 0.9388508945703506, "epoch": 0.6385026737967915, "grad_norm": 0.16529619693756104, "learning_rate": 7.514468426062362e-05, "loss": 0.9638, "mean_token_accuracy": 0.7549322843551636, "num_tokens": 38935036.0, "step": 1194 }, { "entropy": 0.9566285014152527, "epoch": 0.6390374331550802, "grad_norm": 0.15760692954063416, "learning_rate": 7.500095370349367e-05, "loss": 0.9542, "mean_token_accuracy": 0.7584005296230316, "num_tokens": 38967804.0, "step": 1195 }, { "entropy": 0.9731516391038895, "epoch": 0.6395721925133689, "grad_norm": 0.1565542072057724, "learning_rate": 7.48573282360846e-05, "loss": 0.9766, "mean_token_accuracy": 0.7558712065219879, "num_tokens": 38999949.0, "step": 1196 }, { "entropy": 0.9728382974863052, "epoch": 0.6401069518716578, "grad_norm": 0.15303181111812592, "learning_rate": 7.471380828965282e-05, "loss": 0.9616, "mean_token_accuracy": 0.7509942203760147, "num_tokens": 39032654.0, "step": 1197 }, { "entropy": 0.9958923012018204, "epoch": 0.6406417112299465, "grad_norm": 0.17000418901443481, "learning_rate": 7.457039429513779e-05, "loss": 0.9966, "mean_token_accuracy": 0.747314989566803, "num_tokens": 39065379.0, "step": 1198 }, { "entropy": 0.9914234131574631, "epoch": 0.6411764705882353, "grad_norm": 0.16770699620246887, "learning_rate": 7.442708668316088e-05, "loss": 0.9839, "mean_token_accuracy": 0.7523640841245651, "num_tokens": 39097663.0, "step": 1199 }, { "entropy": 1.0625836253166199, "epoch": 0.6417112299465241, "grad_norm": 0.16445571184158325, "learning_rate": 7.428388588402397e-05, "loss": 1.052, "mean_token_accuracy": 0.7347568422555923, "num_tokens": 39129868.0, "step": 1200 }, { "entropy": 0.9884249269962311, "epoch": 0.6422459893048128, "grad_norm": 0.16170428693294525, "learning_rate": 7.414079232770832e-05, "loss": 0.9735, "mean_token_accuracy": 0.7510308921337128, "num_tokens": 39162555.0, "step": 1201 }, { "entropy": 0.9557610899209976, "epoch": 0.6427807486631016, "grad_norm": 0.1583389937877655, "learning_rate": 7.399780644387308e-05, "loss": 0.9521, "mean_token_accuracy": 0.7558740526437759, "num_tokens": 39195316.0, "step": 1202 }, { "entropy": 1.0001453161239624, "epoch": 0.6433155080213904, "grad_norm": 0.16569454967975616, "learning_rate": 7.385492866185418e-05, "loss": 0.9889, "mean_token_accuracy": 0.7535740435123444, "num_tokens": 39228084.0, "step": 1203 }, { "entropy": 0.9466915130615234, "epoch": 0.6438502673796791, "grad_norm": 0.17218825221061707, "learning_rate": 7.371215941066285e-05, "loss": 0.9328, "mean_token_accuracy": 0.7594085931777954, "num_tokens": 39260852.0, "step": 1204 }, { "entropy": 0.9893372505903244, "epoch": 0.6443850267379679, "grad_norm": 0.1548699587583542, "learning_rate": 7.356949911898456e-05, "loss": 0.9718, "mean_token_accuracy": 0.7504263818264008, "num_tokens": 39293542.0, "step": 1205 }, { "entropy": 0.951935887336731, "epoch": 0.6449197860962567, "grad_norm": 0.1720178872346878, "learning_rate": 7.342694821517757e-05, "loss": 0.9394, "mean_token_accuracy": 0.7647266685962677, "num_tokens": 39326010.0, "step": 1206 }, { "entropy": 0.9583040624856949, "epoch": 0.6454545454545455, "grad_norm": 0.1612015664577484, "learning_rate": 7.328450712727161e-05, "loss": 0.9564, "mean_token_accuracy": 0.7522828131914139, "num_tokens": 39358252.0, "step": 1207 }, { "entropy": 0.9418720752000809, "epoch": 0.6459893048128342, "grad_norm": 0.1865474283695221, "learning_rate": 7.314217628296682e-05, "loss": 0.9446, "mean_token_accuracy": 0.7590716779232025, "num_tokens": 39390549.0, "step": 1208 }, { "entropy": 0.9319218695163727, "epoch": 0.646524064171123, "grad_norm": 0.17535893619060516, "learning_rate": 7.299995610963215e-05, "loss": 0.9425, "mean_token_accuracy": 0.7570180594921112, "num_tokens": 39423027.0, "step": 1209 }, { "entropy": 0.9778590053319931, "epoch": 0.6470588235294118, "grad_norm": 0.1859198808670044, "learning_rate": 7.285784703430446e-05, "loss": 0.9955, "mean_token_accuracy": 0.7506783753633499, "num_tokens": 39454976.0, "step": 1210 }, { "entropy": 0.9831230938434601, "epoch": 0.6475935828877005, "grad_norm": 0.16050094366073608, "learning_rate": 7.271584948368677e-05, "loss": 1.002, "mean_token_accuracy": 0.7491774410009384, "num_tokens": 39487666.0, "step": 1211 }, { "entropy": 0.9467550218105316, "epoch": 0.6481283422459893, "grad_norm": 0.16806411743164062, "learning_rate": 7.257396388414737e-05, "loss": 0.9641, "mean_token_accuracy": 0.760325014591217, "num_tokens": 39520434.0, "step": 1212 }, { "entropy": 0.960830956697464, "epoch": 0.6486631016042781, "grad_norm": 0.16417725384235382, "learning_rate": 7.243219066171847e-05, "loss": 0.9686, "mean_token_accuracy": 0.7505804002285004, "num_tokens": 39553202.0, "step": 1213 }, { "entropy": 0.9331855922937393, "epoch": 0.6491978609625668, "grad_norm": 0.19479550421237946, "learning_rate": 7.229053024209467e-05, "loss": 0.9411, "mean_token_accuracy": 0.7603546977043152, "num_tokens": 39585826.0, "step": 1214 }, { "entropy": 0.9689950495958328, "epoch": 0.6497326203208557, "grad_norm": 0.16452471911907196, "learning_rate": 7.2148983050632e-05, "loss": 0.9651, "mean_token_accuracy": 0.7540017068386078, "num_tokens": 39618594.0, "step": 1215 }, { "entropy": 0.9832485765218735, "epoch": 0.6502673796791444, "grad_norm": 0.1860886961221695, "learning_rate": 7.200754951234649e-05, "loss": 0.9847, "mean_token_accuracy": 0.7540322542190552, "num_tokens": 39651362.0, "step": 1216 }, { "entropy": 0.9701773524284363, "epoch": 0.6508021390374331, "grad_norm": 0.17255829274654388, "learning_rate": 7.186623005191288e-05, "loss": 0.9622, "mean_token_accuracy": 0.7551049739122391, "num_tokens": 39683730.0, "step": 1217 }, { "entropy": 1.0146152973175049, "epoch": 0.651336898395722, "grad_norm": 0.24694116413593292, "learning_rate": 7.172502509366332e-05, "loss": 1.0029, "mean_token_accuracy": 0.7463648617267609, "num_tokens": 39716498.0, "step": 1218 }, { "entropy": 0.9297868311405182, "epoch": 0.6518716577540107, "grad_norm": 0.15562401711940765, "learning_rate": 7.158393506158622e-05, "loss": 0.9221, "mean_token_accuracy": 0.7655511498451233, "num_tokens": 39749029.0, "step": 1219 }, { "entropy": 0.9979990869760513, "epoch": 0.6524064171122995, "grad_norm": 0.17048972845077515, "learning_rate": 7.144296037932496e-05, "loss": 0.9864, "mean_token_accuracy": 0.7465503215789795, "num_tokens": 39781597.0, "step": 1220 }, { "entropy": 0.9215275347232819, "epoch": 0.6529411764705882, "grad_norm": 0.18462637066841125, "learning_rate": 7.13021014701765e-05, "loss": 0.9164, "mean_token_accuracy": 0.7666177749633789, "num_tokens": 39814365.0, "step": 1221 }, { "entropy": 0.9937931597232819, "epoch": 0.653475935828877, "grad_norm": 0.2151746302843094, "learning_rate": 7.116135875709011e-05, "loss": 1.0027, "mean_token_accuracy": 0.7481340318918228, "num_tokens": 39846886.0, "step": 1222 }, { "entropy": 1.022530347108841, "epoch": 0.6540106951871658, "grad_norm": 0.16658732295036316, "learning_rate": 7.102073266266632e-05, "loss": 1.0072, "mean_token_accuracy": 0.7454576939344406, "num_tokens": 39879205.0, "step": 1223 }, { "entropy": 0.9836557805538177, "epoch": 0.6545454545454545, "grad_norm": 0.1658448576927185, "learning_rate": 7.088022360915536e-05, "loss": 0.9754, "mean_token_accuracy": 0.7512788772583008, "num_tokens": 39911547.0, "step": 1224 }, { "entropy": 0.9708831012248993, "epoch": 0.6550802139037433, "grad_norm": 0.1592826098203659, "learning_rate": 7.073983201845602e-05, "loss": 0.9737, "mean_token_accuracy": 0.7533071786165237, "num_tokens": 39944224.0, "step": 1225 }, { "entropy": 0.9460985213518143, "epoch": 0.6556149732620321, "grad_norm": 0.16502933204174042, "learning_rate": 7.059955831211448e-05, "loss": 0.9443, "mean_token_accuracy": 0.757226288318634, "num_tokens": 39976628.0, "step": 1226 }, { "entropy": 0.9379078596830368, "epoch": 0.6561497326203208, "grad_norm": 0.16671308875083923, "learning_rate": 7.0459402911323e-05, "loss": 0.9378, "mean_token_accuracy": 0.7584051489830017, "num_tokens": 40009023.0, "step": 1227 }, { "entropy": 0.9769723415374756, "epoch": 0.6566844919786097, "grad_norm": 0.1969531625509262, "learning_rate": 7.031936623691848e-05, "loss": 0.9783, "mean_token_accuracy": 0.7504943460226059, "num_tokens": 40041466.0, "step": 1228 }, { "entropy": 1.0261331349611282, "epoch": 0.6572192513368984, "grad_norm": 0.18100857734680176, "learning_rate": 7.017944870938133e-05, "loss": 1.0133, "mean_token_accuracy": 0.745662271976471, "num_tokens": 40074234.0, "step": 1229 }, { "entropy": 0.9616502523422241, "epoch": 0.6577540106951871, "grad_norm": 0.18322017788887024, "learning_rate": 7.003965074883433e-05, "loss": 0.9778, "mean_token_accuracy": 0.7553763389587402, "num_tokens": 40107002.0, "step": 1230 }, { "entropy": 0.942153662443161, "epoch": 0.658288770053476, "grad_norm": 0.20072796940803528, "learning_rate": 6.98999727750411e-05, "loss": 0.9439, "mean_token_accuracy": 0.7598668038845062, "num_tokens": 40139770.0, "step": 1231 }, { "entropy": 0.9993161708116531, "epoch": 0.6588235294117647, "grad_norm": 0.16346585750579834, "learning_rate": 6.976041520740513e-05, "loss": 0.99, "mean_token_accuracy": 0.7468269765377045, "num_tokens": 40172388.0, "step": 1232 }, { "entropy": 0.9855088144540787, "epoch": 0.6593582887700534, "grad_norm": 0.16769243776798248, "learning_rate": 6.962097846496818e-05, "loss": 0.9849, "mean_token_accuracy": 0.7512524276971817, "num_tokens": 40205156.0, "step": 1233 }, { "entropy": 0.9597685039043427, "epoch": 0.6598930481283423, "grad_norm": 0.22894522547721863, "learning_rate": 6.948166296640939e-05, "loss": 0.9432, "mean_token_accuracy": 0.7628034800291061, "num_tokens": 40237572.0, "step": 1234 }, { "entropy": 1.005010411143303, "epoch": 0.660427807486631, "grad_norm": 0.17149806022644043, "learning_rate": 6.934246913004383e-05, "loss": 1.0129, "mean_token_accuracy": 0.7463954091072083, "num_tokens": 40270340.0, "step": 1235 }, { "entropy": 0.9686301201581955, "epoch": 0.6609625668449198, "grad_norm": 0.1712532788515091, "learning_rate": 6.920339737382111e-05, "loss": 0.9731, "mean_token_accuracy": 0.7489613890647888, "num_tokens": 40303108.0, "step": 1236 }, { "entropy": 0.9406851232051849, "epoch": 0.6614973262032086, "grad_norm": 0.1674990952014923, "learning_rate": 6.906444811532447e-05, "loss": 0.9435, "mean_token_accuracy": 0.7626807540655136, "num_tokens": 40335456.0, "step": 1237 }, { "entropy": 0.9710539728403091, "epoch": 0.6620320855614973, "grad_norm": 0.16204853355884552, "learning_rate": 6.892562177176918e-05, "loss": 0.9718, "mean_token_accuracy": 0.7541085183620453, "num_tokens": 40368044.0, "step": 1238 }, { "entropy": 0.9809377044439316, "epoch": 0.6625668449197861, "grad_norm": 0.1876448392868042, "learning_rate": 6.878691876000158e-05, "loss": 0.9988, "mean_token_accuracy": 0.7467008829116821, "num_tokens": 40400812.0, "step": 1239 }, { "entropy": 0.937059685587883, "epoch": 0.6631016042780749, "grad_norm": 0.1668710559606552, "learning_rate": 6.864833949649752e-05, "loss": 0.9448, "mean_token_accuracy": 0.7630942314863205, "num_tokens": 40433142.0, "step": 1240 }, { "entropy": 0.9396518021821976, "epoch": 0.6636363636363637, "grad_norm": 0.1635650247335434, "learning_rate": 6.85098843973614e-05, "loss": 0.928, "mean_token_accuracy": 0.7608862668275833, "num_tokens": 40465764.0, "step": 1241 }, { "entropy": 0.9529020488262177, "epoch": 0.6641711229946524, "grad_norm": 0.17254072427749634, "learning_rate": 6.837155387832485e-05, "loss": 0.961, "mean_token_accuracy": 0.7572091817855835, "num_tokens": 40498532.0, "step": 1242 }, { "entropy": 0.9303242862224579, "epoch": 0.6647058823529411, "grad_norm": 0.1784360408782959, "learning_rate": 6.823334835474523e-05, "loss": 0.9358, "mean_token_accuracy": 0.7638181447982788, "num_tokens": 40531218.0, "step": 1243 }, { "entropy": 0.8833232223987579, "epoch": 0.66524064171123, "grad_norm": 0.17049525678157806, "learning_rate": 6.809526824160477e-05, "loss": 0.8735, "mean_token_accuracy": 0.7731898725032806, "num_tokens": 40563858.0, "step": 1244 }, { "entropy": 0.9500382095575333, "epoch": 0.6657754010695187, "grad_norm": 0.21515445411205292, "learning_rate": 6.795731395350908e-05, "loss": 0.9341, "mean_token_accuracy": 0.7637630701065063, "num_tokens": 40596268.0, "step": 1245 }, { "entropy": 0.9598138481378555, "epoch": 0.6663101604278074, "grad_norm": 0.1596616506576538, "learning_rate": 6.781948590468592e-05, "loss": 0.9531, "mean_token_accuracy": 0.7549792230129242, "num_tokens": 40629036.0, "step": 1246 }, { "entropy": 0.9623348414897919, "epoch": 0.6668449197860963, "grad_norm": 0.1683739870786667, "learning_rate": 6.768178450898401e-05, "loss": 0.9509, "mean_token_accuracy": 0.7548241764307022, "num_tokens": 40661776.0, "step": 1247 }, { "entropy": 0.9414125084877014, "epoch": 0.667379679144385, "grad_norm": 0.1648664027452469, "learning_rate": 6.754421017987181e-05, "loss": 0.938, "mean_token_accuracy": 0.7581561654806137, "num_tokens": 40694544.0, "step": 1248 }, { "entropy": 0.9510121196508408, "epoch": 0.6679144385026738, "grad_norm": 0.17633424699306488, "learning_rate": 6.740676333043627e-05, "loss": 0.95, "mean_token_accuracy": 0.7547886073589325, "num_tokens": 40726790.0, "step": 1249 }, { "entropy": 0.9642129242420197, "epoch": 0.6684491978609626, "grad_norm": 0.17034272849559784, "learning_rate": 6.72694443733815e-05, "loss": 0.9682, "mean_token_accuracy": 0.756823718547821, "num_tokens": 40758825.0, "step": 1250 }, { "entropy": 0.9768649190664291, "epoch": 0.6689839572192513, "grad_norm": 0.1960660219192505, "learning_rate": 6.713225372102757e-05, "loss": 0.9789, "mean_token_accuracy": 0.7500343322753906, "num_tokens": 40791537.0, "step": 1251 }, { "entropy": 0.9842686802148819, "epoch": 0.6695187165775401, "grad_norm": 0.19921888411045074, "learning_rate": 6.699519178530944e-05, "loss": 0.9889, "mean_token_accuracy": 0.7502443790435791, "num_tokens": 40824305.0, "step": 1252 }, { "entropy": 0.9305083900690079, "epoch": 0.6700534759358289, "grad_norm": 0.1651991307735443, "learning_rate": 6.685825897777541e-05, "loss": 0.9372, "mean_token_accuracy": 0.7601883560419083, "num_tokens": 40856955.0, "step": 1253 }, { "entropy": 0.9830394834280014, "epoch": 0.6705882352941176, "grad_norm": 0.1739017516374588, "learning_rate": 6.672145570958613e-05, "loss": 0.9977, "mean_token_accuracy": 0.7495644688606262, "num_tokens": 40889555.0, "step": 1254 }, { "entropy": 0.9631999880075455, "epoch": 0.6711229946524064, "grad_norm": 0.17304059863090515, "learning_rate": 6.65847823915133e-05, "loss": 0.9629, "mean_token_accuracy": 0.7510248422622681, "num_tokens": 40922091.0, "step": 1255 }, { "entropy": 0.9633833467960358, "epoch": 0.6716577540106952, "grad_norm": 0.17870648205280304, "learning_rate": 6.644823943393839e-05, "loss": 0.9695, "mean_token_accuracy": 0.7529020011425018, "num_tokens": 40954859.0, "step": 1256 }, { "entropy": 0.9755293428897858, "epoch": 0.672192513368984, "grad_norm": 0.16337575018405914, "learning_rate": 6.631182724685153e-05, "loss": 0.9649, "mean_token_accuracy": 0.7519892007112503, "num_tokens": 40987616.0, "step": 1257 }, { "entropy": 0.963219404220581, "epoch": 0.6727272727272727, "grad_norm": 0.1670699417591095, "learning_rate": 6.617554623985006e-05, "loss": 0.9572, "mean_token_accuracy": 0.7563893646001816, "num_tokens": 41020261.0, "step": 1258 }, { "entropy": 0.9529228955507278, "epoch": 0.6732620320855615, "grad_norm": 0.17823971807956696, "learning_rate": 6.603939682213756e-05, "loss": 0.9515, "mean_token_accuracy": 0.7584005296230316, "num_tokens": 41053029.0, "step": 1259 }, { "entropy": 0.9679936170578003, "epoch": 0.6737967914438503, "grad_norm": 0.16420431435108185, "learning_rate": 6.590337940252237e-05, "loss": 0.9687, "mean_token_accuracy": 0.7551368027925491, "num_tokens": 41085587.0, "step": 1260 }, { "entropy": 0.9809670001268387, "epoch": 0.674331550802139, "grad_norm": 0.16018334031105042, "learning_rate": 6.576749438941665e-05, "loss": 0.9758, "mean_token_accuracy": 0.7510380744934082, "num_tokens": 41118115.0, "step": 1261 }, { "entropy": 0.9639696776866913, "epoch": 0.6748663101604279, "grad_norm": 0.16860608756542206, "learning_rate": 6.563174219083476e-05, "loss": 0.9555, "mean_token_accuracy": 0.7552216500043869, "num_tokens": 41150743.0, "step": 1262 }, { "entropy": 1.0253010541200638, "epoch": 0.6754010695187166, "grad_norm": 0.16424211859703064, "learning_rate": 6.549612321439249e-05, "loss": 1.0096, "mean_token_accuracy": 0.7431881278753281, "num_tokens": 41183310.0, "step": 1263 }, { "entropy": 0.9711318612098694, "epoch": 0.6759358288770053, "grad_norm": 0.16268914937973022, "learning_rate": 6.536063786730554e-05, "loss": 0.9595, "mean_token_accuracy": 0.759953111410141, "num_tokens": 41215825.0, "step": 1264 }, { "entropy": 0.9563075453042984, "epoch": 0.6764705882352942, "grad_norm": 0.16132086515426636, "learning_rate": 6.522528655638824e-05, "loss": 0.9445, "mean_token_accuracy": 0.7609447985887527, "num_tokens": 41248582.0, "step": 1265 }, { "entropy": 0.9650041162967682, "epoch": 0.6770053475935829, "grad_norm": 0.1783258467912674, "learning_rate": 6.50900696880527e-05, "loss": 0.9733, "mean_token_accuracy": 0.7552541494369507, "num_tokens": 41281350.0, "step": 1266 }, { "entropy": 0.9366767555475235, "epoch": 0.6775401069518716, "grad_norm": 0.17735034227371216, "learning_rate": 6.495498766830706e-05, "loss": 0.9392, "mean_token_accuracy": 0.7608137875795364, "num_tokens": 41314118.0, "step": 1267 }, { "entropy": 0.9627933204174042, "epoch": 0.6780748663101605, "grad_norm": 0.16228435933589935, "learning_rate": 6.482004090275484e-05, "loss": 0.9575, "mean_token_accuracy": 0.7606304883956909, "num_tokens": 41346886.0, "step": 1268 }, { "entropy": 0.935073509812355, "epoch": 0.6786096256684492, "grad_norm": 0.19033820927143097, "learning_rate": 6.468522979659322e-05, "loss": 0.9283, "mean_token_accuracy": 0.763078510761261, "num_tokens": 41378997.0, "step": 1269 }, { "entropy": 0.9309537261724472, "epoch": 0.679144385026738, "grad_norm": 0.17397263646125793, "learning_rate": 6.455055475461216e-05, "loss": 0.9311, "mean_token_accuracy": 0.7598057091236115, "num_tokens": 41411765.0, "step": 1270 }, { "entropy": 1.0121001154184341, "epoch": 0.6796791443850267, "grad_norm": 0.16641665995121002, "learning_rate": 6.441601618119308e-05, "loss": 1.0186, "mean_token_accuracy": 0.7441959977149963, "num_tokens": 41444533.0, "step": 1271 }, { "entropy": 0.9392533451318741, "epoch": 0.6802139037433155, "grad_norm": 0.18284034729003906, "learning_rate": 6.428161448030751e-05, "loss": 0.9229, "mean_token_accuracy": 0.7665873914957047, "num_tokens": 41477064.0, "step": 1272 }, { "entropy": 0.9318648427724838, "epoch": 0.6807486631016043, "grad_norm": 0.16083700954914093, "learning_rate": 6.414735005551617e-05, "loss": 0.9361, "mean_token_accuracy": 0.7590725719928741, "num_tokens": 41509832.0, "step": 1273 }, { "entropy": 1.0033331364393234, "epoch": 0.681283422459893, "grad_norm": 0.16821704804897308, "learning_rate": 6.401322330996743e-05, "loss": 1.0041, "mean_token_accuracy": 0.7469452619552612, "num_tokens": 41542600.0, "step": 1274 }, { "entropy": 0.9760285168886185, "epoch": 0.6818181818181818, "grad_norm": 0.18433675169944763, "learning_rate": 6.387923464639638e-05, "loss": 0.9945, "mean_token_accuracy": 0.7464347928762436, "num_tokens": 41575072.0, "step": 1275 }, { "entropy": 0.9680602550506592, "epoch": 0.6823529411764706, "grad_norm": 0.16477592289447784, "learning_rate": 6.374538446712339e-05, "loss": 0.9671, "mean_token_accuracy": 0.7558453530073166, "num_tokens": 41607602.0, "step": 1276 }, { "entropy": 0.9561006724834442, "epoch": 0.6828877005347593, "grad_norm": 0.16161902248859406, "learning_rate": 6.361167317405309e-05, "loss": 0.9568, "mean_token_accuracy": 0.758235827088356, "num_tokens": 41640162.0, "step": 1277 }, { "entropy": 0.9576833844184875, "epoch": 0.6834224598930482, "grad_norm": 0.16424843668937683, "learning_rate": 6.34781011686731e-05, "loss": 0.9579, "mean_token_accuracy": 0.7535740435123444, "num_tokens": 41672930.0, "step": 1278 }, { "entropy": 0.96244677901268, "epoch": 0.6839572192513369, "grad_norm": 0.1593867689371109, "learning_rate": 6.334466885205275e-05, "loss": 0.9673, "mean_token_accuracy": 0.7590638697147369, "num_tokens": 41705393.0, "step": 1279 }, { "entropy": 1.010896772146225, "epoch": 0.6844919786096256, "grad_norm": 0.1620362401008606, "learning_rate": 6.321137662484195e-05, "loss": 1.0045, "mean_token_accuracy": 0.7459371984004974, "num_tokens": 41738161.0, "step": 1280 }, { "entropy": 0.9322298914194107, "epoch": 0.6850267379679145, "grad_norm": 0.16803748905658722, "learning_rate": 6.307822488726996e-05, "loss": 0.9421, "mean_token_accuracy": 0.7560789287090302, "num_tokens": 41770929.0, "step": 1281 }, { "entropy": 0.9569824635982513, "epoch": 0.6855614973262032, "grad_norm": 0.16779689490795135, "learning_rate": 6.294521403914425e-05, "loss": 0.9551, "mean_token_accuracy": 0.7504887580871582, "num_tokens": 41803697.0, "step": 1282 }, { "entropy": 0.973842054605484, "epoch": 0.686096256684492, "grad_norm": 0.16040951013565063, "learning_rate": 6.281234447984917e-05, "loss": 0.9628, "mean_token_accuracy": 0.7519855797290802, "num_tokens": 41836465.0, "step": 1283 }, { "entropy": 0.9762571305036545, "epoch": 0.6866310160427808, "grad_norm": 0.16406090557575226, "learning_rate": 6.26796166083449e-05, "loss": 0.9739, "mean_token_accuracy": 0.7509875148534775, "num_tokens": 41869136.0, "step": 1284 }, { "entropy": 0.9796842485666275, "epoch": 0.6871657754010695, "grad_norm": 0.16604503989219666, "learning_rate": 6.254703082316613e-05, "loss": 0.9696, "mean_token_accuracy": 0.7559634149074554, "num_tokens": 41901703.0, "step": 1285 }, { "entropy": 0.9618323147296906, "epoch": 0.6877005347593583, "grad_norm": 0.16067545115947723, "learning_rate": 6.241458752242102e-05, "loss": 0.9503, "mean_token_accuracy": 0.7568731606006622, "num_tokens": 41934471.0, "step": 1286 }, { "entropy": 0.9434874653816223, "epoch": 0.6882352941176471, "grad_norm": 0.16011638939380646, "learning_rate": 6.228228710378976e-05, "loss": 0.9432, "mean_token_accuracy": 0.7582783401012421, "num_tokens": 41967239.0, "step": 1287 }, { "entropy": 1.0110597610473633, "epoch": 0.6887700534759358, "grad_norm": 0.17058852314949036, "learning_rate": 6.215012996452354e-05, "loss": 1.0262, "mean_token_accuracy": 0.745509535074234, "num_tokens": 42000007.0, "step": 1288 }, { "entropy": 0.9647829830646515, "epoch": 0.6893048128342246, "grad_norm": 0.16275008022785187, "learning_rate": 6.201811650144342e-05, "loss": 0.9663, "mean_token_accuracy": 0.7523216009140015, "num_tokens": 42032775.0, "step": 1289 }, { "entropy": 0.939326599240303, "epoch": 0.6898395721925134, "grad_norm": 0.16183090209960938, "learning_rate": 6.188624711093905e-05, "loss": 0.9302, "mean_token_accuracy": 0.7647543996572495, "num_tokens": 42065543.0, "step": 1290 }, { "entropy": 0.9778326153755188, "epoch": 0.6903743315508022, "grad_norm": 0.16926462948322296, "learning_rate": 6.175452218896731e-05, "loss": 0.9935, "mean_token_accuracy": 0.7510386109352112, "num_tokens": 42098311.0, "step": 1291 }, { "entropy": 1.0116349458694458, "epoch": 0.6909090909090909, "grad_norm": 0.16622833907604218, "learning_rate": 6.162294213105149e-05, "loss": 1.0144, "mean_token_accuracy": 0.744976207613945, "num_tokens": 42130923.0, "step": 1292 }, { "entropy": 0.9861657917499542, "epoch": 0.6914438502673796, "grad_norm": 0.16684715449810028, "learning_rate": 6.149150733227986e-05, "loss": 0.9918, "mean_token_accuracy": 0.7465481460094452, "num_tokens": 42163691.0, "step": 1293 }, { "entropy": 0.8746530562639236, "epoch": 0.6919786096256685, "grad_norm": 0.16398267447948456, "learning_rate": 6.136021818730441e-05, "loss": 0.87, "mean_token_accuracy": 0.7736437022686005, "num_tokens": 42196459.0, "step": 1294 }, { "entropy": 0.9123699069023132, "epoch": 0.6925133689839572, "grad_norm": 0.18755117058753967, "learning_rate": 6.122907509033997e-05, "loss": 0.9109, "mean_token_accuracy": 0.7674476206302643, "num_tokens": 42228914.0, "step": 1295 }, { "entropy": 0.9567636549472809, "epoch": 0.6930481283422459, "grad_norm": 0.16553348302841187, "learning_rate": 6.109807843516264e-05, "loss": 0.9483, "mean_token_accuracy": 0.7542024850845337, "num_tokens": 42261592.0, "step": 1296 }, { "entropy": 0.9956474155187607, "epoch": 0.6935828877005348, "grad_norm": 0.17825832962989807, "learning_rate": 6.096722861510898e-05, "loss": 1.0149, "mean_token_accuracy": 0.7447462528944016, "num_tokens": 42294101.0, "step": 1297 }, { "entropy": 0.9219067543745041, "epoch": 0.6941176470588235, "grad_norm": 0.1673230230808258, "learning_rate": 6.083652602307455e-05, "loss": 0.9053, "mean_token_accuracy": 0.7665437757968903, "num_tokens": 42326639.0, "step": 1298 }, { "entropy": 0.9638742357492447, "epoch": 0.6946524064171123, "grad_norm": 0.17454005777835846, "learning_rate": 6.0705971051512856e-05, "loss": 0.9518, "mean_token_accuracy": 0.7599993348121643, "num_tokens": 42359065.0, "step": 1299 }, { "entropy": 1.0120955258607864, "epoch": 0.6951871657754011, "grad_norm": 0.17239855229854584, "learning_rate": 6.057556409243424e-05, "loss": 1.0185, "mean_token_accuracy": 0.7423326075077057, "num_tokens": 42391833.0, "step": 1300 }, { "entropy": 1.004928171634674, "epoch": 0.6957219251336898, "grad_norm": 0.16988736391067505, "learning_rate": 6.0445305537404427e-05, "loss": 0.9945, "mean_token_accuracy": 0.7474340200424194, "num_tokens": 42424601.0, "step": 1301 }, { "entropy": 0.9260279536247253, "epoch": 0.6962566844919786, "grad_norm": 0.19346578419208527, "learning_rate": 6.031519577754377e-05, "loss": 0.9218, "mean_token_accuracy": 0.7631844580173492, "num_tokens": 42456794.0, "step": 1302 }, { "entropy": 0.9603572189807892, "epoch": 0.6967914438502674, "grad_norm": 0.16860198974609375, "learning_rate": 6.0185235203525636e-05, "loss": 0.9448, "mean_token_accuracy": 0.7589503824710846, "num_tokens": 42489562.0, "step": 1303 }, { "entropy": 0.9546780437231064, "epoch": 0.6973262032085561, "grad_norm": 0.1686096489429474, "learning_rate": 6.0055424205575594e-05, "loss": 0.9671, "mean_token_accuracy": 0.7567043155431747, "num_tokens": 42522264.0, "step": 1304 }, { "entropy": 0.9691311120986938, "epoch": 0.6978609625668449, "grad_norm": 0.21208062767982483, "learning_rate": 5.992576317346995e-05, "loss": 0.9512, "mean_token_accuracy": 0.7584279030561447, "num_tokens": 42554963.0, "step": 1305 }, { "entropy": 0.9300960153341293, "epoch": 0.6983957219251337, "grad_norm": 0.1617356240749359, "learning_rate": 5.9796252496534845e-05, "loss": 0.9162, "mean_token_accuracy": 0.7626371830701828, "num_tokens": 42587269.0, "step": 1306 }, { "entropy": 0.9444008618593216, "epoch": 0.6989304812834225, "grad_norm": 0.1645589917898178, "learning_rate": 5.966689256364486e-05, "loss": 0.9397, "mean_token_accuracy": 0.7619356215000153, "num_tokens": 42619692.0, "step": 1307 }, { "entropy": 0.9678683280944824, "epoch": 0.6994652406417112, "grad_norm": 0.18692892789840698, "learning_rate": 5.9537683763222064e-05, "loss": 0.9769, "mean_token_accuracy": 0.7515884637832642, "num_tokens": 42652460.0, "step": 1308 }, { "entropy": 0.9600773751735687, "epoch": 0.7, "grad_norm": 0.17012649774551392, "learning_rate": 5.94086264832346e-05, "loss": 0.9561, "mean_token_accuracy": 0.7551014125347137, "num_tokens": 42685228.0, "step": 1309 }, { "entropy": 0.9367721974849701, "epoch": 0.7005347593582888, "grad_norm": 0.1679380238056183, "learning_rate": 5.927972111119565e-05, "loss": 0.9229, "mean_token_accuracy": 0.7605284154415131, "num_tokens": 42717805.0, "step": 1310 }, { "entropy": 0.9343129098415375, "epoch": 0.7010695187165775, "grad_norm": 0.21657076478004456, "learning_rate": 5.915096803416243e-05, "loss": 0.9427, "mean_token_accuracy": 0.7589403390884399, "num_tokens": 42750378.0, "step": 1311 }, { "entropy": 0.9220041334629059, "epoch": 0.7016042780748664, "grad_norm": 0.19568952918052673, "learning_rate": 5.902236763873463e-05, "loss": 0.9127, "mean_token_accuracy": 0.7643386423587799, "num_tokens": 42782938.0, "step": 1312 }, { "entropy": 0.9896649420261383, "epoch": 0.7021390374331551, "grad_norm": 0.1709054410457611, "learning_rate": 5.889392031105371e-05, "loss": 0.9861, "mean_token_accuracy": 0.7547959536314011, "num_tokens": 42815706.0, "step": 1313 }, { "entropy": 0.9917427897453308, "epoch": 0.7026737967914438, "grad_norm": 0.1694716513156891, "learning_rate": 5.876562643680139e-05, "loss": 0.9975, "mean_token_accuracy": 0.7487170100212097, "num_tokens": 42848474.0, "step": 1314 }, { "entropy": 0.9077127426862717, "epoch": 0.7032085561497327, "grad_norm": 0.18303543329238892, "learning_rate": 5.8637486401198725e-05, "loss": 0.913, "mean_token_accuracy": 0.7661791741847992, "num_tokens": 42880895.0, "step": 1315 }, { "entropy": 0.9104649573564529, "epoch": 0.7037433155080214, "grad_norm": 0.20730172097682953, "learning_rate": 5.850950058900475e-05, "loss": 0.9204, "mean_token_accuracy": 0.7629520893096924, "num_tokens": 42913663.0, "step": 1316 }, { "entropy": 0.965117797255516, "epoch": 0.7042780748663101, "grad_norm": 0.15987767279148102, "learning_rate": 5.838166938451542e-05, "loss": 0.9575, "mean_token_accuracy": 0.7545472979545593, "num_tokens": 42946046.0, "step": 1317 }, { "entropy": 0.9923857301473618, "epoch": 0.704812834224599, "grad_norm": 0.27166005969047546, "learning_rate": 5.825399317156253e-05, "loss": 0.9909, "mean_token_accuracy": 0.746456503868103, "num_tokens": 42978814.0, "step": 1318 }, { "entropy": 1.0188444405794144, "epoch": 0.7053475935828877, "grad_norm": 0.1819809079170227, "learning_rate": 5.812647233351249e-05, "loss": 1.0373, "mean_token_accuracy": 0.7383614480495453, "num_tokens": 43011582.0, "step": 1319 }, { "entropy": 0.9351516962051392, "epoch": 0.7058823529411765, "grad_norm": 0.19748146831989288, "learning_rate": 5.7999107253265085e-05, "loss": 0.9504, "mean_token_accuracy": 0.7600806355476379, "num_tokens": 43044350.0, "step": 1320 }, { "entropy": 0.9284085929393768, "epoch": 0.7064171122994652, "grad_norm": 0.17606669664382935, "learning_rate": 5.787189831325248e-05, "loss": 0.9315, "mean_token_accuracy": 0.7659762799739838, "num_tokens": 43077118.0, "step": 1321 }, { "entropy": 0.9412466883659363, "epoch": 0.706951871657754, "grad_norm": 0.17121106386184692, "learning_rate": 5.774484589543805e-05, "loss": 0.9439, "mean_token_accuracy": 0.7576881498098373, "num_tokens": 43109709.0, "step": 1322 }, { "entropy": 0.9344121664762497, "epoch": 0.7074866310160428, "grad_norm": 0.15921074151992798, "learning_rate": 5.761795038131512e-05, "loss": 0.9272, "mean_token_accuracy": 0.765518069267273, "num_tokens": 43142477.0, "step": 1323 }, { "entropy": 0.9707403481006622, "epoch": 0.7080213903743315, "grad_norm": 0.18937453627586365, "learning_rate": 5.749121215190584e-05, "loss": 0.9742, "mean_token_accuracy": 0.7521077692508698, "num_tokens": 43175245.0, "step": 1324 }, { "entropy": 0.9573013186454773, "epoch": 0.7085561497326203, "grad_norm": 0.16704417765140533, "learning_rate": 5.7364631587760195e-05, "loss": 0.9533, "mean_token_accuracy": 0.7544599175453186, "num_tokens": 43208013.0, "step": 1325 }, { "entropy": 0.9601723700761795, "epoch": 0.7090909090909091, "grad_norm": 0.16236352920532227, "learning_rate": 5.7238209068954765e-05, "loss": 0.9446, "mean_token_accuracy": 0.7608654052019119, "num_tokens": 43240510.0, "step": 1326 }, { "entropy": 1.0053239911794662, "epoch": 0.7096256684491978, "grad_norm": 0.16887569427490234, "learning_rate": 5.7111944975091445e-05, "loss": 0.9963, "mean_token_accuracy": 0.7504582107067108, "num_tokens": 43273278.0, "step": 1327 }, { "entropy": 0.9479286074638367, "epoch": 0.7101604278074867, "grad_norm": 0.166823610663414, "learning_rate": 5.698583968529657e-05, "loss": 0.9481, "mean_token_accuracy": 0.7575452029705048, "num_tokens": 43306046.0, "step": 1328 }, { "entropy": 0.972056582570076, "epoch": 0.7106951871657754, "grad_norm": 0.19135132431983948, "learning_rate": 5.6859893578219616e-05, "loss": 0.9712, "mean_token_accuracy": 0.7543448656797409, "num_tokens": 43338499.0, "step": 1329 }, { "entropy": 0.9440675228834152, "epoch": 0.7112299465240641, "grad_norm": 0.16059821844100952, "learning_rate": 5.673410703203204e-05, "loss": 0.9329, "mean_token_accuracy": 0.7643572688102722, "num_tokens": 43371267.0, "step": 1330 }, { "entropy": 0.9147678464651108, "epoch": 0.711764705882353, "grad_norm": 0.16524194180965424, "learning_rate": 5.660848042442617e-05, "loss": 0.9154, "mean_token_accuracy": 0.7649071216583252, "num_tokens": 43404035.0, "step": 1331 }, { "entropy": 0.9377564489841461, "epoch": 0.7122994652406417, "grad_norm": 0.17137596011161804, "learning_rate": 5.648301413261421e-05, "loss": 0.9274, "mean_token_accuracy": 0.7636241465806961, "num_tokens": 43436803.0, "step": 1332 }, { "entropy": 0.9109043776988983, "epoch": 0.7128342245989305, "grad_norm": 0.17169232666492462, "learning_rate": 5.635770853332692e-05, "loss": 0.9172, "mean_token_accuracy": 0.7651301324367523, "num_tokens": 43469114.0, "step": 1333 }, { "entropy": 0.955856204032898, "epoch": 0.7133689839572193, "grad_norm": 0.1743943989276886, "learning_rate": 5.623256400281254e-05, "loss": 0.9703, "mean_token_accuracy": 0.7509894818067551, "num_tokens": 43501360.0, "step": 1334 }, { "entropy": 0.9887515902519226, "epoch": 0.713903743315508, "grad_norm": 0.17928454279899597, "learning_rate": 5.610758091683571e-05, "loss": 0.9901, "mean_token_accuracy": 0.74660924077034, "num_tokens": 43534128.0, "step": 1335 }, { "entropy": 0.9664270430803299, "epoch": 0.7144385026737968, "grad_norm": 0.16999004781246185, "learning_rate": 5.598275965067634e-05, "loss": 0.9711, "mean_token_accuracy": 0.7529754191637039, "num_tokens": 43566812.0, "step": 1336 }, { "entropy": 0.9780944734811783, "epoch": 0.7149732620320856, "grad_norm": 0.16349083185195923, "learning_rate": 5.585810057912839e-05, "loss": 0.9788, "mean_token_accuracy": 0.7536194622516632, "num_tokens": 43599361.0, "step": 1337 }, { "entropy": 1.0061749964952469, "epoch": 0.7155080213903743, "grad_norm": 0.1756104975938797, "learning_rate": 5.5733604076498793e-05, "loss": 1.0217, "mean_token_accuracy": 0.7461908757686615, "num_tokens": 43631461.0, "step": 1338 }, { "entropy": 0.9739767909049988, "epoch": 0.7160427807486631, "grad_norm": 0.16898559033870697, "learning_rate": 5.5609270516606425e-05, "loss": 0.9669, "mean_token_accuracy": 0.7514312863349915, "num_tokens": 43664011.0, "step": 1339 }, { "entropy": 0.9885522723197937, "epoch": 0.7165775401069518, "grad_norm": 0.16172735393047333, "learning_rate": 5.5485100272780864e-05, "loss": 0.9784, "mean_token_accuracy": 0.750897690653801, "num_tokens": 43696719.0, "step": 1340 }, { "entropy": 0.9710711240768433, "epoch": 0.7171122994652407, "grad_norm": 0.16851507127285004, "learning_rate": 5.536109371786128e-05, "loss": 0.9656, "mean_token_accuracy": 0.7565042674541473, "num_tokens": 43729377.0, "step": 1341 }, { "entropy": 0.9953486323356628, "epoch": 0.7176470588235294, "grad_norm": 0.17063665390014648, "learning_rate": 5.5237251224195384e-05, "loss": 1.0157, "mean_token_accuracy": 0.7456031739711761, "num_tokens": 43761773.0, "step": 1342 }, { "entropy": 0.9645649939775467, "epoch": 0.7181818181818181, "grad_norm": 0.17601872980594635, "learning_rate": 5.5113573163638275e-05, "loss": 0.9631, "mean_token_accuracy": 0.7569347620010376, "num_tokens": 43794432.0, "step": 1343 }, { "entropy": 0.9721208810806274, "epoch": 0.718716577540107, "grad_norm": 0.16801919043064117, "learning_rate": 5.4990059907551305e-05, "loss": 0.9477, "mean_token_accuracy": 0.7608031928539276, "num_tokens": 43826989.0, "step": 1344 }, { "entropy": 0.9556037187576294, "epoch": 0.7192513368983957, "grad_norm": 0.1672016680240631, "learning_rate": 5.4866711826800885e-05, "loss": 0.9584, "mean_token_accuracy": 0.7554068863391876, "num_tokens": 43859757.0, "step": 1345 }, { "entropy": 0.9977209866046906, "epoch": 0.7197860962566844, "grad_norm": 0.17609484493732452, "learning_rate": 5.474352929175761e-05, "loss": 1.01, "mean_token_accuracy": 0.7487013041973114, "num_tokens": 43892321.0, "step": 1346 }, { "entropy": 0.9755626320838928, "epoch": 0.7203208556149733, "grad_norm": 0.17414574325084686, "learning_rate": 5.462051267229493e-05, "loss": 0.9783, "mean_token_accuracy": 0.7515273690223694, "num_tokens": 43925089.0, "step": 1347 }, { "entropy": 0.9573537856340408, "epoch": 0.720855614973262, "grad_norm": 0.18283627927303314, "learning_rate": 5.449766233778815e-05, "loss": 0.9441, "mean_token_accuracy": 0.7602858394384384, "num_tokens": 43957550.0, "step": 1348 }, { "entropy": 0.9958482980728149, "epoch": 0.7213903743315508, "grad_norm": 0.18237966299057007, "learning_rate": 5.4374978657113185e-05, "loss": 0.9848, "mean_token_accuracy": 0.750946968793869, "num_tokens": 43990318.0, "step": 1349 }, { "entropy": 0.9420408457517624, "epoch": 0.7219251336898396, "grad_norm": 0.16323111951351166, "learning_rate": 5.4252461998645654e-05, "loss": 0.9268, "mean_token_accuracy": 0.7569953501224518, "num_tokens": 44023086.0, "step": 1350 }, { "entropy": 0.980293333530426, "epoch": 0.7224598930481283, "grad_norm": 0.20707236230373383, "learning_rate": 5.4130112730259574e-05, "loss": 0.9757, "mean_token_accuracy": 0.7540020644664764, "num_tokens": 44055838.0, "step": 1351 }, { "entropy": 0.9224491715431213, "epoch": 0.7229946524064171, "grad_norm": 0.19375093281269073, "learning_rate": 5.400793121932647e-05, "loss": 0.9268, "mean_token_accuracy": 0.7603861093521118, "num_tokens": 44088606.0, "step": 1352 }, { "entropy": 0.9549875855445862, "epoch": 0.7235294117647059, "grad_norm": 0.16448384523391724, "learning_rate": 5.3885917832713996e-05, "loss": 0.9622, "mean_token_accuracy": 0.7590114772319794, "num_tokens": 44121374.0, "step": 1353 }, { "entropy": 0.9891581833362579, "epoch": 0.7240641711229947, "grad_norm": 0.18951919674873352, "learning_rate": 5.376407293678511e-05, "loss": 1.0005, "mean_token_accuracy": 0.7460899353027344, "num_tokens": 44154142.0, "step": 1354 }, { "entropy": 0.9467734545469284, "epoch": 0.7245989304812834, "grad_norm": 0.18200156092643738, "learning_rate": 5.364239689739685e-05, "loss": 0.9452, "mean_token_accuracy": 0.7579423189163208, "num_tokens": 44186910.0, "step": 1355 }, { "entropy": 0.9351662695407867, "epoch": 0.7251336898395722, "grad_norm": 0.16058847308158875, "learning_rate": 5.3520890079899124e-05, "loss": 0.9285, "mean_token_accuracy": 0.7623175531625748, "num_tokens": 44219504.0, "step": 1356 }, { "entropy": 0.9368904829025269, "epoch": 0.725668449197861, "grad_norm": 0.17099831998348236, "learning_rate": 5.339955284913385e-05, "loss": 0.9531, "mean_token_accuracy": 0.7577284872531891, "num_tokens": 44252272.0, "step": 1357 }, { "entropy": 0.9771641194820404, "epoch": 0.7262032085561497, "grad_norm": 0.18690308928489685, "learning_rate": 5.3278385569433695e-05, "loss": 0.9681, "mean_token_accuracy": 0.7535831779241562, "num_tokens": 44285018.0, "step": 1358 }, { "entropy": 0.9384803473949432, "epoch": 0.7267379679144385, "grad_norm": 0.16335232555866241, "learning_rate": 5.315738860462103e-05, "loss": 0.9327, "mean_token_accuracy": 0.7607322484254837, "num_tokens": 44317570.0, "step": 1359 }, { "entropy": 0.9756060987710953, "epoch": 0.7272727272727273, "grad_norm": 0.17174574732780457, "learning_rate": 5.303656231800675e-05, "loss": 0.9725, "mean_token_accuracy": 0.7532685697078705, "num_tokens": 44350338.0, "step": 1360 }, { "entropy": 0.9902632385492325, "epoch": 0.727807486631016, "grad_norm": 0.1803690642118454, "learning_rate": 5.2915907072389406e-05, "loss": 0.986, "mean_token_accuracy": 0.7520039975643158, "num_tokens": 44382125.0, "step": 1361 }, { "entropy": 0.9341016560792923, "epoch": 0.7283422459893049, "grad_norm": 0.1689465492963791, "learning_rate": 5.279542323005387e-05, "loss": 0.9396, "mean_token_accuracy": 0.7578506767749786, "num_tokens": 44414893.0, "step": 1362 }, { "entropy": 0.9944004416465759, "epoch": 0.7288770053475936, "grad_norm": 0.16927452385425568, "learning_rate": 5.2675111152770385e-05, "loss": 0.9936, "mean_token_accuracy": 0.7520087957382202, "num_tokens": 44447437.0, "step": 1363 }, { "entropy": 0.9670007675886154, "epoch": 0.7294117647058823, "grad_norm": 0.16915826499462128, "learning_rate": 5.2554971201793424e-05, "loss": 0.96, "mean_token_accuracy": 0.756583958864212, "num_tokens": 44480034.0, "step": 1364 }, { "entropy": 0.9163489788770676, "epoch": 0.7299465240641712, "grad_norm": 0.16509558260440826, "learning_rate": 5.24350037378607e-05, "loss": 0.9029, "mean_token_accuracy": 0.7656762450933456, "num_tokens": 44512732.0, "step": 1365 }, { "entropy": 0.9414806365966797, "epoch": 0.7304812834224599, "grad_norm": 0.16583573818206787, "learning_rate": 5.2315209121191924e-05, "loss": 0.9358, "mean_token_accuracy": 0.7630437314510345, "num_tokens": 44545500.0, "step": 1366 }, { "entropy": 0.9694929867982864, "epoch": 0.7310160427807486, "grad_norm": 0.16455243527889252, "learning_rate": 5.2195587711487784e-05, "loss": 0.9687, "mean_token_accuracy": 0.7541544437408447, "num_tokens": 44578268.0, "step": 1367 }, { "entropy": 0.9365357607603073, "epoch": 0.7315508021390374, "grad_norm": 0.163929283618927, "learning_rate": 5.207613986792899e-05, "loss": 0.9279, "mean_token_accuracy": 0.7610830962657928, "num_tokens": 44610993.0, "step": 1368 }, { "entropy": 0.9800110459327698, "epoch": 0.7320855614973262, "grad_norm": 0.16541478037834167, "learning_rate": 5.195686594917504e-05, "loss": 0.9809, "mean_token_accuracy": 0.7503935098648071, "num_tokens": 44643141.0, "step": 1369 }, { "entropy": 1.0133154392242432, "epoch": 0.732620320855615, "grad_norm": 0.17673271894454956, "learning_rate": 5.183776631336319e-05, "loss": 1.0329, "mean_token_accuracy": 0.7408357262611389, "num_tokens": 44675581.0, "step": 1370 }, { "entropy": 0.9362034201622009, "epoch": 0.7331550802139037, "grad_norm": 0.16646535694599152, "learning_rate": 5.171884131810739e-05, "loss": 0.937, "mean_token_accuracy": 0.7587976455688477, "num_tokens": 44708349.0, "step": 1371 }, { "entropy": 0.9442934691905975, "epoch": 0.7336898395721925, "grad_norm": 0.17669664323329926, "learning_rate": 5.160009132049728e-05, "loss": 0.9532, "mean_token_accuracy": 0.7564622759819031, "num_tokens": 44740948.0, "step": 1372 }, { "entropy": 0.9387456774711609, "epoch": 0.7342245989304813, "grad_norm": 0.19216324388980865, "learning_rate": 5.148151667709693e-05, "loss": 0.9448, "mean_token_accuracy": 0.7606610357761383, "num_tokens": 44773716.0, "step": 1373 }, { "entropy": 0.9824355393648148, "epoch": 0.73475935828877, "grad_norm": 0.16717301309108734, "learning_rate": 5.136311774394389e-05, "loss": 0.9823, "mean_token_accuracy": 0.7504276633262634, "num_tokens": 44806484.0, "step": 1374 }, { "entropy": 0.9298852831125259, "epoch": 0.7352941176470589, "grad_norm": 0.1649925261735916, "learning_rate": 5.12448948765482e-05, "loss": 0.9268, "mean_token_accuracy": 0.7589503824710846, "num_tokens": 44839252.0, "step": 1375 }, { "entropy": 0.8915909230709076, "epoch": 0.7358288770053476, "grad_norm": 0.1836596429347992, "learning_rate": 5.1126848429891214e-05, "loss": 0.8939, "mean_token_accuracy": 0.7735643535852432, "num_tokens": 44871729.0, "step": 1376 }, { "entropy": 0.9786400496959686, "epoch": 0.7363636363636363, "grad_norm": 0.18812385201454163, "learning_rate": 5.1008978758424545e-05, "loss": 0.9731, "mean_token_accuracy": 0.7497861683368683, "num_tokens": 44904497.0, "step": 1377 }, { "entropy": 0.8944540321826935, "epoch": 0.7368983957219252, "grad_norm": 0.1752358078956604, "learning_rate": 5.089128621606892e-05, "loss": 0.8838, "mean_token_accuracy": 0.7728800028562546, "num_tokens": 44937265.0, "step": 1378 }, { "entropy": 0.9364291578531265, "epoch": 0.7374331550802139, "grad_norm": 0.22087101638317108, "learning_rate": 5.077377115621341e-05, "loss": 0.9376, "mean_token_accuracy": 0.7617378532886505, "num_tokens": 44969888.0, "step": 1379 }, { "entropy": 0.9458759129047394, "epoch": 0.7379679144385026, "grad_norm": 0.1907070279121399, "learning_rate": 5.065643393171398e-05, "loss": 0.9295, "mean_token_accuracy": 0.7640746533870697, "num_tokens": 45002567.0, "step": 1380 }, { "entropy": 0.9374886602163315, "epoch": 0.7385026737967915, "grad_norm": 0.1655336618423462, "learning_rate": 5.0539274894892695e-05, "loss": 0.9246, "mean_token_accuracy": 0.7612414360046387, "num_tokens": 45035335.0, "step": 1381 }, { "entropy": 0.9554038047790527, "epoch": 0.7390374331550802, "grad_norm": 0.1619529128074646, "learning_rate": 5.042229439753659e-05, "loss": 0.951, "mean_token_accuracy": 0.7614552676677704, "num_tokens": 45068103.0, "step": 1382 }, { "entropy": 0.9579029679298401, "epoch": 0.739572192513369, "grad_norm": 0.18599477410316467, "learning_rate": 5.0305492790896604e-05, "loss": 0.9598, "mean_token_accuracy": 0.7529549300670624, "num_tokens": 45100450.0, "step": 1383 }, { "entropy": 0.9384265094995499, "epoch": 0.7401069518716578, "grad_norm": 0.17478297650814056, "learning_rate": 5.018887042568656e-05, "loss": 0.9367, "mean_token_accuracy": 0.7592450827360153, "num_tokens": 45132859.0, "step": 1384 }, { "entropy": 0.9385294169187546, "epoch": 0.7406417112299465, "grad_norm": 0.16543041169643402, "learning_rate": 5.0072427652082e-05, "loss": 0.9495, "mean_token_accuracy": 0.7601417303085327, "num_tokens": 45165627.0, "step": 1385 }, { "entropy": 0.9611698240041733, "epoch": 0.7411764705882353, "grad_norm": 0.17224179208278656, "learning_rate": 4.995616481971934e-05, "loss": 0.9802, "mean_token_accuracy": 0.7500352561473846, "num_tokens": 45198288.0, "step": 1386 }, { "entropy": 0.9673365503549576, "epoch": 0.741711229946524, "grad_norm": 0.17497476935386658, "learning_rate": 4.984008227769457e-05, "loss": 0.9681, "mean_token_accuracy": 0.7542460858821869, "num_tokens": 45231056.0, "step": 1387 }, { "entropy": 0.9386740624904633, "epoch": 0.7422459893048128, "grad_norm": 0.18572238087654114, "learning_rate": 4.972418037456236e-05, "loss": 0.9367, "mean_token_accuracy": 0.7579202800989151, "num_tokens": 45263505.0, "step": 1388 }, { "entropy": 0.9856511950492859, "epoch": 0.7427807486631016, "grad_norm": 0.1692817360162735, "learning_rate": 4.960845945833504e-05, "loss": 0.9807, "mean_token_accuracy": 0.7517717480659485, "num_tokens": 45296273.0, "step": 1389 }, { "entropy": 0.9516720473766327, "epoch": 0.7433155080213903, "grad_norm": 0.16946376860141754, "learning_rate": 4.9492919876481485e-05, "loss": 0.9615, "mean_token_accuracy": 0.7579386234283447, "num_tokens": 45328969.0, "step": 1390 }, { "entropy": 1.0015557408332825, "epoch": 0.7438502673796792, "grad_norm": 0.165788471698761, "learning_rate": 4.937756197592609e-05, "loss": 1.0071, "mean_token_accuracy": 0.7453262507915497, "num_tokens": 45361737.0, "step": 1391 }, { "entropy": 0.9206484109163284, "epoch": 0.7443850267379679, "grad_norm": 0.17079102993011475, "learning_rate": 4.9262386103047677e-05, "loss": 0.9099, "mean_token_accuracy": 0.7662206590175629, "num_tokens": 45394505.0, "step": 1392 }, { "entropy": 0.9758808314800262, "epoch": 0.7449197860962566, "grad_norm": 0.1746615320444107, "learning_rate": 4.914739260367856e-05, "loss": 0.9856, "mean_token_accuracy": 0.7506234645843506, "num_tokens": 45427144.0, "step": 1393 }, { "entropy": 0.9476794600486755, "epoch": 0.7454545454545455, "grad_norm": 0.1681424230337143, "learning_rate": 4.903258182310338e-05, "loss": 0.9544, "mean_token_accuracy": 0.7548570334911346, "num_tokens": 45459912.0, "step": 1394 }, { "entropy": 1.0142509192228317, "epoch": 0.7459893048128342, "grad_norm": 0.1720559000968933, "learning_rate": 4.891795410605826e-05, "loss": 1.006, "mean_token_accuracy": 0.74620620906353, "num_tokens": 45492002.0, "step": 1395 }, { "entropy": 0.9728702455759048, "epoch": 0.746524064171123, "grad_norm": 0.17016872763633728, "learning_rate": 4.880350979672949e-05, "loss": 0.9642, "mean_token_accuracy": 0.751099705696106, "num_tokens": 45524770.0, "step": 1396 }, { "entropy": 0.9072166979312897, "epoch": 0.7470588235294118, "grad_norm": 0.16708402335643768, "learning_rate": 4.8689249238752786e-05, "loss": 0.9021, "mean_token_accuracy": 0.7652431428432465, "num_tokens": 45557538.0, "step": 1397 }, { "entropy": 0.9732768088579178, "epoch": 0.7475935828877005, "grad_norm": 0.16878630220890045, "learning_rate": 4.857517277521208e-05, "loss": 0.9693, "mean_token_accuracy": 0.7514357268810272, "num_tokens": 45590306.0, "step": 1398 }, { "entropy": 0.9616824388504028, "epoch": 0.7481283422459893, "grad_norm": 0.16568832099437714, "learning_rate": 4.8461280748638485e-05, "loss": 0.9579, "mean_token_accuracy": 0.7586461007595062, "num_tokens": 45622625.0, "step": 1399 }, { "entropy": 0.9650434404611588, "epoch": 0.7486631016042781, "grad_norm": 0.18817420303821564, "learning_rate": 4.83475735010094e-05, "loss": 0.9589, "mean_token_accuracy": 0.7563316226005554, "num_tokens": 45655250.0, "step": 1400 }, { "entropy": 0.9968360215425491, "epoch": 0.7491978609625668, "grad_norm": 0.1672886610031128, "learning_rate": 4.823405137374731e-05, "loss": 0.9932, "mean_token_accuracy": 0.7512084990739822, "num_tokens": 45687908.0, "step": 1401 }, { "entropy": 0.9780010432004929, "epoch": 0.7497326203208556, "grad_norm": 0.16161589324474335, "learning_rate": 4.812071470771895e-05, "loss": 0.9612, "mean_token_accuracy": 0.7587670981884003, "num_tokens": 45720676.0, "step": 1402 }, { "entropy": 0.9526931345462799, "epoch": 0.7502673796791444, "grad_norm": 0.16700731217861176, "learning_rate": 4.8007563843234055e-05, "loss": 0.9336, "mean_token_accuracy": 0.7571175396442413, "num_tokens": 45753444.0, "step": 1403 }, { "entropy": 0.9786204248666763, "epoch": 0.7508021390374332, "grad_norm": 0.16651296615600586, "learning_rate": 4.789459912004456e-05, "loss": 0.967, "mean_token_accuracy": 0.7521077692508698, "num_tokens": 45786212.0, "step": 1404 }, { "entropy": 0.9743318110704422, "epoch": 0.7513368983957219, "grad_norm": 0.17126482725143433, "learning_rate": 4.7781820877343484e-05, "loss": 0.9664, "mean_token_accuracy": 0.7522910535335541, "num_tokens": 45818980.0, "step": 1405 }, { "entropy": 1.0189561545848846, "epoch": 0.7518716577540107, "grad_norm": 0.17651601135730743, "learning_rate": 4.766922945376389e-05, "loss": 1.0309, "mean_token_accuracy": 0.7422950267791748, "num_tokens": 45851006.0, "step": 1406 }, { "entropy": 0.9401010423898697, "epoch": 0.7524064171122995, "grad_norm": 0.16605083644390106, "learning_rate": 4.755682518737784e-05, "loss": 0.9318, "mean_token_accuracy": 0.7648623734712601, "num_tokens": 45883635.0, "step": 1407 }, { "entropy": 0.966943696141243, "epoch": 0.7529411764705882, "grad_norm": 0.1740500032901764, "learning_rate": 4.744460841569551e-05, "loss": 0.9731, "mean_token_accuracy": 0.7538737058639526, "num_tokens": 45915984.0, "step": 1408 }, { "entropy": 0.9927944242954254, "epoch": 0.753475935828877, "grad_norm": 0.1688157021999359, "learning_rate": 4.733257947566405e-05, "loss": 0.9948, "mean_token_accuracy": 0.7516429573297501, "num_tokens": 45948312.0, "step": 1409 }, { "entropy": 0.945795476436615, "epoch": 0.7540106951871658, "grad_norm": 0.17462849617004395, "learning_rate": 4.7220738703666594e-05, "loss": 0.9536, "mean_token_accuracy": 0.7565722614526749, "num_tokens": 45980996.0, "step": 1410 }, { "entropy": 0.9234698563814163, "epoch": 0.7545454545454545, "grad_norm": 0.17292383313179016, "learning_rate": 4.710908643552133e-05, "loss": 0.9326, "mean_token_accuracy": 0.7631048262119293, "num_tokens": 46013764.0, "step": 1411 }, { "entropy": 0.9526192247867584, "epoch": 0.7550802139037434, "grad_norm": 0.1689281016588211, "learning_rate": 4.699762300648042e-05, "loss": 0.9623, "mean_token_accuracy": 0.754096120595932, "num_tokens": 46046127.0, "step": 1412 }, { "entropy": 0.9535632431507111, "epoch": 0.7556149732620321, "grad_norm": 0.17146529257297516, "learning_rate": 4.6886348751229025e-05, "loss": 0.9543, "mean_token_accuracy": 0.7565059959888458, "num_tokens": 46078580.0, "step": 1413 }, { "entropy": 0.9169730544090271, "epoch": 0.7561497326203208, "grad_norm": 0.16838543117046356, "learning_rate": 4.677526400388421e-05, "loss": 0.9041, "mean_token_accuracy": 0.7646932899951935, "num_tokens": 46111348.0, "step": 1414 }, { "entropy": 0.9166508615016937, "epoch": 0.7566844919786097, "grad_norm": 0.17483341693878174, "learning_rate": 4.6664369097994124e-05, "loss": 0.921, "mean_token_accuracy": 0.7638685256242752, "num_tokens": 46144116.0, "step": 1415 }, { "entropy": 0.9559438526630402, "epoch": 0.7572192513368984, "grad_norm": 0.1699785441160202, "learning_rate": 4.6553664366536794e-05, "loss": 0.9508, "mean_token_accuracy": 0.7549486756324768, "num_tokens": 46176884.0, "step": 1416 }, { "entropy": 0.9044709354639053, "epoch": 0.7577540106951872, "grad_norm": 0.16961148381233215, "learning_rate": 4.6443150141919245e-05, "loss": 0.9067, "mean_token_accuracy": 0.767368733882904, "num_tokens": 46209291.0, "step": 1417 }, { "entropy": 0.918102815747261, "epoch": 0.758288770053476, "grad_norm": 0.16606849431991577, "learning_rate": 4.633282675597651e-05, "loss": 0.9129, "mean_token_accuracy": 0.7693549543619156, "num_tokens": 46241998.0, "step": 1418 }, { "entropy": 0.9878501445055008, "epoch": 0.7588235294117647, "grad_norm": 0.18879282474517822, "learning_rate": 4.622269453997057e-05, "loss": 0.9881, "mean_token_accuracy": 0.7488882839679718, "num_tokens": 46274726.0, "step": 1419 }, { "entropy": 0.946536123752594, "epoch": 0.7593582887700535, "grad_norm": 0.17403647303581238, "learning_rate": 4.611275382458947e-05, "loss": 0.9443, "mean_token_accuracy": 0.7589809447526932, "num_tokens": 46307494.0, "step": 1420 }, { "entropy": 0.9222494065761566, "epoch": 0.7598930481283422, "grad_norm": 0.179395392537117, "learning_rate": 4.6003004939946076e-05, "loss": 0.9275, "mean_token_accuracy": 0.7654569745063782, "num_tokens": 46340262.0, "step": 1421 }, { "entropy": 0.8983130007982254, "epoch": 0.760427807486631, "grad_norm": 0.18136689066886902, "learning_rate": 4.589344821557744e-05, "loss": 0.9022, "mean_token_accuracy": 0.7665067464113235, "num_tokens": 46372697.0, "step": 1422 }, { "entropy": 0.9430101960897446, "epoch": 0.7609625668449198, "grad_norm": 0.17819227278232574, "learning_rate": 4.578408398044349e-05, "loss": 0.9438, "mean_token_accuracy": 0.7585532665252686, "num_tokens": 46405465.0, "step": 1423 }, { "entropy": 0.9201874732971191, "epoch": 0.7614973262032085, "grad_norm": 0.17150764167308807, "learning_rate": 4.5674912562926284e-05, "loss": 0.9198, "mean_token_accuracy": 0.7630615085363388, "num_tokens": 46437886.0, "step": 1424 }, { "entropy": 0.9516346603631973, "epoch": 0.7620320855614974, "grad_norm": 0.19696636497974396, "learning_rate": 4.556593429082879e-05, "loss": 0.9477, "mean_token_accuracy": 0.7558687180280685, "num_tokens": 46470408.0, "step": 1425 }, { "entropy": 0.9484929591417313, "epoch": 0.7625668449197861, "grad_norm": 0.18077801167964935, "learning_rate": 4.545714949137414e-05, "loss": 0.9408, "mean_token_accuracy": 0.7573008239269257, "num_tokens": 46503176.0, "step": 1426 }, { "entropy": 0.9580115526914597, "epoch": 0.7631016042780748, "grad_norm": 0.1708674430847168, "learning_rate": 4.534855849120453e-05, "loss": 0.9537, "mean_token_accuracy": 0.7538977414369583, "num_tokens": 46535857.0, "step": 1427 }, { "entropy": 0.8961019665002823, "epoch": 0.7636363636363637, "grad_norm": 0.17240260541439056, "learning_rate": 4.524016161638016e-05, "loss": 0.9055, "mean_token_accuracy": 0.7688349038362503, "num_tokens": 46568295.0, "step": 1428 }, { "entropy": 0.9304610937833786, "epoch": 0.7641711229946524, "grad_norm": 0.1705794334411621, "learning_rate": 4.513195919237843e-05, "loss": 0.9144, "mean_token_accuracy": 0.7627077251672745, "num_tokens": 46601063.0, "step": 1429 }, { "entropy": 0.9417256414890289, "epoch": 0.7647058823529411, "grad_norm": 0.1612071841955185, "learning_rate": 4.502395154409281e-05, "loss": 0.9327, "mean_token_accuracy": 0.7602944672107697, "num_tokens": 46633831.0, "step": 1430 }, { "entropy": 0.9605920910835266, "epoch": 0.76524064171123, "grad_norm": 0.16719336807727814, "learning_rate": 4.491613899583198e-05, "loss": 0.9534, "mean_token_accuracy": 0.7547348439693451, "num_tokens": 46666599.0, "step": 1431 }, { "entropy": 0.9498101323843002, "epoch": 0.7657754010695187, "grad_norm": 0.16406437754631042, "learning_rate": 4.4808521871318744e-05, "loss": 0.9423, "mean_token_accuracy": 0.755987286567688, "num_tokens": 46699367.0, "step": 1432 }, { "entropy": 0.9573682099580765, "epoch": 0.7663101604278075, "grad_norm": 0.17057931423187256, "learning_rate": 4.470110049368919e-05, "loss": 0.967, "mean_token_accuracy": 0.7498778104782104, "num_tokens": 46732135.0, "step": 1433 }, { "entropy": 0.9842314422130585, "epoch": 0.7668449197860963, "grad_norm": 0.1765008121728897, "learning_rate": 4.4593875185491574e-05, "loss": 0.9826, "mean_token_accuracy": 0.7512218952178955, "num_tokens": 46764903.0, "step": 1434 }, { "entropy": 0.9449929594993591, "epoch": 0.767379679144385, "grad_norm": 0.16481998562812805, "learning_rate": 4.4486846268685513e-05, "loss": 0.928, "mean_token_accuracy": 0.76295405626297, "num_tokens": 46797237.0, "step": 1435 }, { "entropy": 0.9285380840301514, "epoch": 0.7679144385026738, "grad_norm": 0.1714523881673813, "learning_rate": 4.438001406464084e-05, "loss": 0.9325, "mean_token_accuracy": 0.7573924660682678, "num_tokens": 46830005.0, "step": 1436 }, { "entropy": 0.9558539241552353, "epoch": 0.7684491978609626, "grad_norm": 0.17213749885559082, "learning_rate": 4.427337889413675e-05, "loss": 0.9676, "mean_token_accuracy": 0.7559363394975662, "num_tokens": 46862617.0, "step": 1437 }, { "entropy": 0.9706854373216629, "epoch": 0.7689839572192514, "grad_norm": 0.17222625017166138, "learning_rate": 4.41669410773609e-05, "loss": 0.973, "mean_token_accuracy": 0.7512218952178955, "num_tokens": 46895385.0, "step": 1438 }, { "entropy": 0.961706668138504, "epoch": 0.7695187165775401, "grad_norm": 0.17626358568668365, "learning_rate": 4.406070093390821e-05, "loss": 0.9669, "mean_token_accuracy": 0.756006732583046, "num_tokens": 46928143.0, "step": 1439 }, { "entropy": 0.9379317164421082, "epoch": 0.7700534759358288, "grad_norm": 0.17700813710689545, "learning_rate": 4.395465878278019e-05, "loss": 0.9395, "mean_token_accuracy": 0.7630215734243393, "num_tokens": 46960844.0, "step": 1440 }, { "entropy": 0.9566432982683182, "epoch": 0.7705882352941177, "grad_norm": 0.1715305745601654, "learning_rate": 4.384881494238381e-05, "loss": 0.9554, "mean_token_accuracy": 0.7504063248634338, "num_tokens": 46993497.0, "step": 1441 }, { "entropy": 0.9218139499425888, "epoch": 0.7711229946524064, "grad_norm": 0.1674298495054245, "learning_rate": 4.37431697305306e-05, "loss": 0.9266, "mean_token_accuracy": 0.76496821641922, "num_tokens": 47026265.0, "step": 1442 }, { "entropy": 0.9717037230730057, "epoch": 0.7716577540106951, "grad_norm": 0.1699391007423401, "learning_rate": 4.363772346443561e-05, "loss": 0.9604, "mean_token_accuracy": 0.7572091817855835, "num_tokens": 47059033.0, "step": 1443 }, { "entropy": 0.9532619267702103, "epoch": 0.772192513368984, "grad_norm": 0.17742779850959778, "learning_rate": 4.353247646071657e-05, "loss": 0.9679, "mean_token_accuracy": 0.753029465675354, "num_tokens": 47091785.0, "step": 1444 }, { "entropy": 0.9997300058603287, "epoch": 0.7727272727272727, "grad_norm": 0.17780952155590057, "learning_rate": 4.342742903539294e-05, "loss": 1.01, "mean_token_accuracy": 0.7437186241149902, "num_tokens": 47124363.0, "step": 1445 }, { "entropy": 0.9187695980072021, "epoch": 0.7732620320855615, "grad_norm": 0.16247506439685822, "learning_rate": 4.332258150388484e-05, "loss": 0.91, "mean_token_accuracy": 0.7681971341371536, "num_tokens": 47156893.0, "step": 1446 }, { "entropy": 0.9684152752161026, "epoch": 0.7737967914438503, "grad_norm": 0.1999502331018448, "learning_rate": 4.321793418101221e-05, "loss": 0.96, "mean_token_accuracy": 0.7576979398727417, "num_tokens": 47189661.0, "step": 1447 }, { "entropy": 0.9522790312767029, "epoch": 0.774331550802139, "grad_norm": 0.16581189632415771, "learning_rate": 4.3113487380993865e-05, "loss": 0.9267, "mean_token_accuracy": 0.7602584809064865, "num_tokens": 47222039.0, "step": 1448 }, { "entropy": 0.9538277089595795, "epoch": 0.7748663101604278, "grad_norm": 0.1805843710899353, "learning_rate": 4.3009241417446514e-05, "loss": 0.953, "mean_token_accuracy": 0.7612296044826508, "num_tokens": 47254252.0, "step": 1449 }, { "entropy": 0.9197836816310883, "epoch": 0.7754010695187166, "grad_norm": 0.19354727864265442, "learning_rate": 4.290519660338377e-05, "loss": 0.9113, "mean_token_accuracy": 0.7670795321464539, "num_tokens": 47286900.0, "step": 1450 }, { "entropy": 0.9822978228330612, "epoch": 0.7759358288770053, "grad_norm": 0.1752813756465912, "learning_rate": 4.280135325121529e-05, "loss": 0.9764, "mean_token_accuracy": 0.7531676441431046, "num_tokens": 47319401.0, "step": 1451 }, { "entropy": 0.984327420592308, "epoch": 0.7764705882352941, "grad_norm": 0.1943073570728302, "learning_rate": 4.269771167274585e-05, "loss": 0.9789, "mean_token_accuracy": 0.7508493363857269, "num_tokens": 47352055.0, "step": 1452 }, { "entropy": 0.960977628827095, "epoch": 0.7770053475935829, "grad_norm": 0.18327106535434723, "learning_rate": 4.2594272179174367e-05, "loss": 0.9565, "mean_token_accuracy": 0.7579166293144226, "num_tokens": 47384554.0, "step": 1453 }, { "entropy": 0.9817952513694763, "epoch": 0.7775401069518717, "grad_norm": 0.17658685147762299, "learning_rate": 4.2491035081092894e-05, "loss": 0.9837, "mean_token_accuracy": 0.7525317072868347, "num_tokens": 47417246.0, "step": 1454 }, { "entropy": 0.9385758340358734, "epoch": 0.7780748663101604, "grad_norm": 0.18255873024463654, "learning_rate": 4.238800068848585e-05, "loss": 0.9523, "mean_token_accuracy": 0.7575452029705048, "num_tokens": 47450014.0, "step": 1455 }, { "entropy": 0.9425090700387955, "epoch": 0.7786096256684492, "grad_norm": 0.1812393069267273, "learning_rate": 4.2285169310728976e-05, "loss": 0.9358, "mean_token_accuracy": 0.7598954886198044, "num_tokens": 47482235.0, "step": 1456 }, { "entropy": 0.9353173077106476, "epoch": 0.779144385026738, "grad_norm": 0.16947653889656067, "learning_rate": 4.2182541256588415e-05, "loss": 0.923, "mean_token_accuracy": 0.7609665244817734, "num_tokens": 47515003.0, "step": 1457 }, { "entropy": 0.9787223637104034, "epoch": 0.7796791443850267, "grad_norm": 0.16849343478679657, "learning_rate": 4.2080116834219766e-05, "loss": 0.9569, "mean_token_accuracy": 0.756384402513504, "num_tokens": 47547771.0, "step": 1458 }, { "entropy": 0.9411111623048782, "epoch": 0.7802139037433156, "grad_norm": 0.17539463937282562, "learning_rate": 4.197789635116728e-05, "loss": 0.9509, "mean_token_accuracy": 0.7613932490348816, "num_tokens": 47580153.0, "step": 1459 }, { "entropy": 0.9682489633560181, "epoch": 0.7807486631016043, "grad_norm": 0.2003321647644043, "learning_rate": 4.1875880114362826e-05, "loss": 0.9712, "mean_token_accuracy": 0.753556489944458, "num_tokens": 47612902.0, "step": 1460 }, { "entropy": 0.9437247663736343, "epoch": 0.781283422459893, "grad_norm": 0.17065325379371643, "learning_rate": 4.1774068430124926e-05, "loss": 0.9458, "mean_token_accuracy": 0.7608443200588226, "num_tokens": 47645670.0, "step": 1461 }, { "entropy": 0.9649447053670883, "epoch": 0.7818181818181819, "grad_norm": 0.1729692965745926, "learning_rate": 4.1672461604157964e-05, "loss": 0.9773, "mean_token_accuracy": 0.7494195997714996, "num_tokens": 47678438.0, "step": 1462 }, { "entropy": 0.9338851720094681, "epoch": 0.7823529411764706, "grad_norm": 0.169850692152977, "learning_rate": 4.1571059941551206e-05, "loss": 0.9375, "mean_token_accuracy": 0.7621547877788544, "num_tokens": 47710902.0, "step": 1463 }, { "entropy": 0.9382988661527634, "epoch": 0.7828877005347593, "grad_norm": 0.17178453505039215, "learning_rate": 4.1469863746777904e-05, "loss": 0.9307, "mean_token_accuracy": 0.7574306279420853, "num_tokens": 47743518.0, "step": 1464 }, { "entropy": 0.8995357155799866, "epoch": 0.7834224598930482, "grad_norm": 0.17019139230251312, "learning_rate": 4.13688733236943e-05, "loss": 0.8944, "mean_token_accuracy": 0.7671439498662949, "num_tokens": 47775655.0, "step": 1465 }, { "entropy": 0.9421730637550354, "epoch": 0.7839572192513369, "grad_norm": 0.1680029332637787, "learning_rate": 4.12680889755388e-05, "loss": 0.9431, "mean_token_accuracy": 0.7600500881671906, "num_tokens": 47808423.0, "step": 1466 }, { "entropy": 0.9358892440795898, "epoch": 0.7844919786096257, "grad_norm": 0.17465731501579285, "learning_rate": 4.116751100493108e-05, "loss": 0.9384, "mean_token_accuracy": 0.7588892877101898, "num_tokens": 47841191.0, "step": 1467 }, { "entropy": 0.9814324527978897, "epoch": 0.7850267379679144, "grad_norm": 0.17830786108970642, "learning_rate": 4.1067139713871086e-05, "loss": 0.9945, "mean_token_accuracy": 0.7513263672590256, "num_tokens": 47873789.0, "step": 1468 }, { "entropy": 1.0155832320451736, "epoch": 0.7855614973262032, "grad_norm": 0.17654651403427124, "learning_rate": 4.0966975403738194e-05, "loss": 1.0217, "mean_token_accuracy": 0.7456433027982712, "num_tokens": 47906248.0, "step": 1469 }, { "entropy": 0.9782692193984985, "epoch": 0.786096256684492, "grad_norm": 0.17136093974113464, "learning_rate": 4.086701837529032e-05, "loss": 0.9816, "mean_token_accuracy": 0.7524743378162384, "num_tokens": 47939016.0, "step": 1470 }, { "entropy": 0.943878784775734, "epoch": 0.7866310160427807, "grad_norm": 0.20054322481155396, "learning_rate": 4.0767268928662974e-05, "loss": 0.9353, "mean_token_accuracy": 0.7583309412002563, "num_tokens": 47971345.0, "step": 1471 }, { "entropy": 0.9630401432514191, "epoch": 0.7871657754010695, "grad_norm": 0.17285215854644775, "learning_rate": 4.066772736336834e-05, "loss": 0.973, "mean_token_accuracy": 0.7511119246482849, "num_tokens": 48003878.0, "step": 1472 }, { "entropy": 0.9346894472837448, "epoch": 0.7877005347593583, "grad_norm": 0.17615626752376556, "learning_rate": 4.056839397829441e-05, "loss": 0.9282, "mean_token_accuracy": 0.7593169510364532, "num_tokens": 48036646.0, "step": 1473 }, { "entropy": 0.9807346612215042, "epoch": 0.788235294117647, "grad_norm": 0.17368116974830627, "learning_rate": 4.046926907170414e-05, "loss": 0.9778, "mean_token_accuracy": 0.7508859932422638, "num_tokens": 48069049.0, "step": 1474 }, { "entropy": 0.9940079301595688, "epoch": 0.7887700534759359, "grad_norm": 0.17876891791820526, "learning_rate": 4.0370352941234427e-05, "loss": 0.9993, "mean_token_accuracy": 0.7478200197219849, "num_tokens": 48101213.0, "step": 1475 }, { "entropy": 0.9659792631864548, "epoch": 0.7893048128342246, "grad_norm": 0.16823144257068634, "learning_rate": 4.027164588389534e-05, "loss": 0.9756, "mean_token_accuracy": 0.7521383166313171, "num_tokens": 48133981.0, "step": 1476 }, { "entropy": 0.9486778080463409, "epoch": 0.7898395721925133, "grad_norm": 0.17777416110038757, "learning_rate": 4.0173148196069155e-05, "loss": 0.931, "mean_token_accuracy": 0.7633533477783203, "num_tokens": 48166500.0, "step": 1477 }, { "entropy": 0.9380469620227814, "epoch": 0.7903743315508022, "grad_norm": 0.19114357233047485, "learning_rate": 4.007486017350952e-05, "loss": 0.9379, "mean_token_accuracy": 0.7576062977313995, "num_tokens": 48199268.0, "step": 1478 }, { "entropy": 0.9314498156309128, "epoch": 0.7909090909090909, "grad_norm": 0.16669416427612305, "learning_rate": 3.9976782111340466e-05, "loss": 0.9224, "mean_token_accuracy": 0.7631019949913025, "num_tokens": 48231768.0, "step": 1479 }, { "entropy": 0.9650258719921112, "epoch": 0.7914438502673797, "grad_norm": 0.18811096251010895, "learning_rate": 3.987891430405561e-05, "loss": 0.9592, "mean_token_accuracy": 0.7563233077526093, "num_tokens": 48264536.0, "step": 1480 }, { "entropy": 0.9652723371982574, "epoch": 0.7919786096256685, "grad_norm": 0.18682901561260223, "learning_rate": 3.978125704551728e-05, "loss": 0.9596, "mean_token_accuracy": 0.7536351382732391, "num_tokens": 48297304.0, "step": 1481 }, { "entropy": 0.9712996333837509, "epoch": 0.7925133689839572, "grad_norm": 0.17110350728034973, "learning_rate": 3.9683810628955616e-05, "loss": 0.9592, "mean_token_accuracy": 0.755139946937561, "num_tokens": 48329753.0, "step": 1482 }, { "entropy": 0.908642441034317, "epoch": 0.793048128342246, "grad_norm": 0.17509803175926208, "learning_rate": 3.958657534696758e-05, "loss": 0.9078, "mean_token_accuracy": 0.7689491957426071, "num_tokens": 48362216.0, "step": 1483 }, { "entropy": 0.9365021735429764, "epoch": 0.7935828877005348, "grad_norm": 0.18727156519889832, "learning_rate": 3.9489551491516246e-05, "loss": 0.9439, "mean_token_accuracy": 0.753926008939743, "num_tokens": 48394956.0, "step": 1484 }, { "entropy": 0.9669003784656525, "epoch": 0.7941176470588235, "grad_norm": 0.173688605427742, "learning_rate": 3.9392739353929884e-05, "loss": 0.9778, "mean_token_accuracy": 0.7519550323486328, "num_tokens": 48427724.0, "step": 1485 }, { "entropy": 0.9892281740903854, "epoch": 0.7946524064171123, "grad_norm": 0.17844879627227783, "learning_rate": 3.929613922490096e-05, "loss": 0.9798, "mean_token_accuracy": 0.7524132430553436, "num_tokens": 48460492.0, "step": 1486 }, { "entropy": 0.9603982418775558, "epoch": 0.795187165775401, "grad_norm": 0.17463773488998413, "learning_rate": 3.9199751394485385e-05, "loss": 0.9452, "mean_token_accuracy": 0.7603353261947632, "num_tokens": 48492842.0, "step": 1487 }, { "entropy": 0.926127091050148, "epoch": 0.7957219251336899, "grad_norm": 0.18449467420578003, "learning_rate": 3.910357615210163e-05, "loss": 0.9307, "mean_token_accuracy": 0.7617607414722443, "num_tokens": 48525610.0, "step": 1488 }, { "entropy": 0.9539791196584702, "epoch": 0.7962566844919786, "grad_norm": 0.1840931475162506, "learning_rate": 3.900761378652987e-05, "loss": 0.9535, "mean_token_accuracy": 0.7557183653116226, "num_tokens": 48558277.0, "step": 1489 }, { "entropy": 0.9705661684274673, "epoch": 0.7967914438502673, "grad_norm": 0.18575632572174072, "learning_rate": 3.891186458591098e-05, "loss": 0.9756, "mean_token_accuracy": 0.751893937587738, "num_tokens": 48591045.0, "step": 1490 }, { "entropy": 0.9732887297868729, "epoch": 0.7973262032085562, "grad_norm": 0.18022392690181732, "learning_rate": 3.88163288377459e-05, "loss": 0.9794, "mean_token_accuracy": 0.750747948884964, "num_tokens": 48623382.0, "step": 1491 }, { "entropy": 0.9191063940525055, "epoch": 0.7978609625668449, "grad_norm": 0.16626055538654327, "learning_rate": 3.87210068288946e-05, "loss": 0.912, "mean_token_accuracy": 0.7680219113826752, "num_tokens": 48656074.0, "step": 1492 }, { "entropy": 0.9622265249490738, "epoch": 0.7983957219251336, "grad_norm": 0.17625634372234344, "learning_rate": 3.8625898845575224e-05, "loss": 0.956, "mean_token_accuracy": 0.7564660012722015, "num_tokens": 48688629.0, "step": 1493 }, { "entropy": 0.9250707626342773, "epoch": 0.7989304812834225, "grad_norm": 0.16939716041088104, "learning_rate": 3.85310051733633e-05, "loss": 0.9027, "mean_token_accuracy": 0.7660240978002548, "num_tokens": 48721110.0, "step": 1494 }, { "entropy": 0.936187669634819, "epoch": 0.7994652406417112, "grad_norm": 0.17359718680381775, "learning_rate": 3.843632609719088e-05, "loss": 0.9456, "mean_token_accuracy": 0.7620920091867447, "num_tokens": 48753669.0, "step": 1495 }, { "entropy": 1.0146075338125229, "epoch": 0.8, "grad_norm": 0.17324475944042206, "learning_rate": 3.834186190134567e-05, "loss": 1.0144, "mean_token_accuracy": 0.7455156147480011, "num_tokens": 48786164.0, "step": 1496 }, { "entropy": 0.953261598944664, "epoch": 0.8005347593582888, "grad_norm": 0.17196255922317505, "learning_rate": 3.824761286947011e-05, "loss": 0.943, "mean_token_accuracy": 0.7580036818981171, "num_tokens": 48818109.0, "step": 1497 }, { "entropy": 0.9698213785886765, "epoch": 0.8010695187165775, "grad_norm": 0.16612346470355988, "learning_rate": 3.8153579284560606e-05, "loss": 0.9641, "mean_token_accuracy": 0.7526881694793701, "num_tokens": 48850877.0, "step": 1498 }, { "entropy": 0.9340949952602386, "epoch": 0.8016042780748663, "grad_norm": 0.17099621891975403, "learning_rate": 3.8059761428966695e-05, "loss": 0.9346, "mean_token_accuracy": 0.7570564448833466, "num_tokens": 48883645.0, "step": 1499 }, { "entropy": 0.9078283756971359, "epoch": 0.8021390374331551, "grad_norm": 0.1709538996219635, "learning_rate": 3.796615958439009e-05, "loss": 0.9004, "mean_token_accuracy": 0.7695465236902237, "num_tokens": 48916331.0, "step": 1500 } ], "logging_steps": 1, "max_steps": 1870, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.24019845480448e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }