{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 834, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.51171875, "epoch": 0.0036068530207394047, "grad_norm": 0.3030305504798889, "learning_rate": 0.0005, "loss": 2.0211, "mean_token_accuracy": 0.5720161497592926, "num_tokens": 61274.0, "step": 1 }, { "entropy": 1.69140625, "epoch": 0.007213706041478809, "grad_norm": 0.2522773742675781, "learning_rate": 0.0004996402877697842, "loss": 1.8722, "mean_token_accuracy": 0.5848744511604309, "num_tokens": 122121.0, "step": 2 }, { "entropy": 2.453125, "epoch": 0.010820559062218215, "grad_norm": 0.5655789971351624, "learning_rate": 0.0004992805755395683, "loss": 1.8934, "mean_token_accuracy": 0.5764134377241135, "num_tokens": 184888.0, "step": 3 }, { "entropy": 1.845703125, "epoch": 0.014427412082957619, "grad_norm": 0.33911973237991333, "learning_rate": 0.0004989208633093525, "loss": 1.767, "mean_token_accuracy": 0.5942296981811523, "num_tokens": 245749.0, "step": 4 }, { "entropy": 1.587890625, "epoch": 0.018034265103697024, "grad_norm": 0.19839665293693542, "learning_rate": 0.0004985611510791367, "loss": 1.6849, "mean_token_accuracy": 0.603009894490242, "num_tokens": 306941.0, "step": 5 }, { "entropy": 1.619140625, "epoch": 0.02164111812443643, "grad_norm": 0.2460078001022339, "learning_rate": 0.0004982014388489208, "loss": 1.6812, "mean_token_accuracy": 0.6077425181865692, "num_tokens": 368720.0, "step": 6 }, { "entropy": 1.7421875, "epoch": 0.025247971145175834, "grad_norm": 0.19908468425273895, "learning_rate": 0.000497841726618705, "loss": 1.6784, "mean_token_accuracy": 0.6059225052595139, "num_tokens": 428636.0, "step": 7 }, { "entropy": 1.646484375, "epoch": 0.028854824165915238, "grad_norm": 0.12581661343574524, "learning_rate": 0.0004974820143884893, "loss": 1.5537, "mean_token_accuracy": 0.6240372508764267, "num_tokens": 490592.0, "step": 8 }, { "entropy": 1.55078125, "epoch": 0.032461677186654644, "grad_norm": 0.10293233394622803, "learning_rate": 0.0004971223021582734, "loss": 1.5397, "mean_token_accuracy": 0.6273812800645828, "num_tokens": 553465.0, "step": 9 }, { "entropy": 1.533203125, "epoch": 0.03606853020739405, "grad_norm": 0.19007287919521332, "learning_rate": 0.0004967625899280576, "loss": 1.5904, "mean_token_accuracy": 0.6185768842697144, "num_tokens": 614590.0, "step": 10 }, { "entropy": 1.546875, "epoch": 0.03967538322813345, "grad_norm": 0.18794836103916168, "learning_rate": 0.0004964028776978418, "loss": 1.5206, "mean_token_accuracy": 0.626290962100029, "num_tokens": 676632.0, "step": 11 }, { "entropy": 1.595703125, "epoch": 0.04328223624887286, "grad_norm": 0.13097693026065826, "learning_rate": 0.0004960431654676259, "loss": 1.5479, "mean_token_accuracy": 0.6230891793966293, "num_tokens": 737284.0, "step": 12 }, { "entropy": 1.544921875, "epoch": 0.046889089269612265, "grad_norm": 0.10859325528144836, "learning_rate": 0.0004956834532374101, "loss": 1.5123, "mean_token_accuracy": 0.6295682936906815, "num_tokens": 801463.0, "step": 13 }, { "entropy": 1.5390625, "epoch": 0.05049594229035167, "grad_norm": 0.15148380398750305, "learning_rate": 0.0004953237410071943, "loss": 1.5403, "mean_token_accuracy": 0.6239829212427139, "num_tokens": 861335.0, "step": 14 }, { "entropy": 1.564453125, "epoch": 0.05410279531109107, "grad_norm": 0.10483004152774811, "learning_rate": 0.0004949640287769784, "loss": 1.5041, "mean_token_accuracy": 0.6288192272186279, "num_tokens": 923160.0, "step": 15 }, { "entropy": 1.53125, "epoch": 0.057709648331830475, "grad_norm": 0.10613196343183517, "learning_rate": 0.0004946043165467626, "loss": 1.4733, "mean_token_accuracy": 0.6381650120019913, "num_tokens": 984403.0, "step": 16 }, { "entropy": 1.447265625, "epoch": 0.061316501352569885, "grad_norm": 0.10855649411678314, "learning_rate": 0.0004942446043165468, "loss": 1.4582, "mean_token_accuracy": 0.6399344354867935, "num_tokens": 1047341.0, "step": 17 }, { "entropy": 1.44140625, "epoch": 0.06492335437330929, "grad_norm": 0.10314038395881653, "learning_rate": 0.000493884892086331, "loss": 1.4526, "mean_token_accuracy": 0.6409045904874802, "num_tokens": 1108877.0, "step": 18 }, { "entropy": 1.53125, "epoch": 0.06853020739404869, "grad_norm": 0.11326713114976883, "learning_rate": 0.0004935251798561151, "loss": 1.4761, "mean_token_accuracy": 0.6339438855648041, "num_tokens": 1171415.0, "step": 19 }, { "entropy": 1.533203125, "epoch": 0.0721370604147881, "grad_norm": 0.10330658406019211, "learning_rate": 0.0004931654676258993, "loss": 1.4848, "mean_token_accuracy": 0.6318182647228241, "num_tokens": 1233232.0, "step": 20 }, { "entropy": 1.474609375, "epoch": 0.0757439134355275, "grad_norm": 0.13809040188789368, "learning_rate": 0.0004928057553956834, "loss": 1.4819, "mean_token_accuracy": 0.6353569775819778, "num_tokens": 1295155.0, "step": 21 }, { "entropy": 1.501953125, "epoch": 0.0793507664562669, "grad_norm": 0.09941317141056061, "learning_rate": 0.0004924460431654676, "loss": 1.4562, "mean_token_accuracy": 0.6376430094242096, "num_tokens": 1357204.0, "step": 22 }, { "entropy": 1.5546875, "epoch": 0.0829576194770063, "grad_norm": 0.15400655567646027, "learning_rate": 0.0004920863309352518, "loss": 1.4684, "mean_token_accuracy": 0.6349789351224899, "num_tokens": 1417465.0, "step": 23 }, { "entropy": 1.482421875, "epoch": 0.08656447249774572, "grad_norm": 0.10283487290143967, "learning_rate": 0.0004917266187050359, "loss": 1.4501, "mean_token_accuracy": 0.6401519179344177, "num_tokens": 1479099.0, "step": 24 }, { "entropy": 1.408203125, "epoch": 0.09017132551848513, "grad_norm": 0.10001431405544281, "learning_rate": 0.0004913669064748201, "loss": 1.4002, "mean_token_accuracy": 0.6491079777479172, "num_tokens": 1539378.0, "step": 25 }, { "entropy": 1.42578125, "epoch": 0.09377817853922453, "grad_norm": 0.09198654443025589, "learning_rate": 0.0004910071942446044, "loss": 1.4315, "mean_token_accuracy": 0.6429430246353149, "num_tokens": 1602199.0, "step": 26 }, { "entropy": 1.447265625, "epoch": 0.09738503155996393, "grad_norm": 0.11325015127658844, "learning_rate": 0.0004906474820143886, "loss": 1.3883, "mean_token_accuracy": 0.6512083262205124, "num_tokens": 1663219.0, "step": 27 }, { "entropy": 1.443359375, "epoch": 0.10099188458070334, "grad_norm": 0.10132941603660583, "learning_rate": 0.0004902877697841727, "loss": 1.398, "mean_token_accuracy": 0.6499650478363037, "num_tokens": 1724186.0, "step": 28 }, { "entropy": 1.412109375, "epoch": 0.10459873760144274, "grad_norm": 0.09528914093971252, "learning_rate": 0.0004899280575539569, "loss": 1.4192, "mean_token_accuracy": 0.647320106625557, "num_tokens": 1784449.0, "step": 29 }, { "entropy": 1.443359375, "epoch": 0.10820559062218214, "grad_norm": 0.09159240871667862, "learning_rate": 0.000489568345323741, "loss": 1.4108, "mean_token_accuracy": 0.6470601409673691, "num_tokens": 1846053.0, "step": 30 }, { "entropy": 1.4765625, "epoch": 0.11181244364292155, "grad_norm": 0.10358703136444092, "learning_rate": 0.0004892086330935252, "loss": 1.4073, "mean_token_accuracy": 0.6460356265306473, "num_tokens": 1908993.0, "step": 31 }, { "entropy": 1.427734375, "epoch": 0.11541929666366095, "grad_norm": 0.08892232179641724, "learning_rate": 0.0004888489208633094, "loss": 1.391, "mean_token_accuracy": 0.6510511487722397, "num_tokens": 1971352.0, "step": 32 }, { "entropy": 1.416015625, "epoch": 0.11902614968440037, "grad_norm": 0.1008245125412941, "learning_rate": 0.0004884892086330936, "loss": 1.4141, "mean_token_accuracy": 0.6441760659217834, "num_tokens": 2033115.0, "step": 33 }, { "entropy": 1.462890625, "epoch": 0.12263300270513977, "grad_norm": 0.09512194246053696, "learning_rate": 0.0004881294964028777, "loss": 1.4375, "mean_token_accuracy": 0.6436735391616821, "num_tokens": 2095452.0, "step": 34 }, { "entropy": 1.48046875, "epoch": 0.12623985572587917, "grad_norm": 0.10468802601099014, "learning_rate": 0.0004877697841726619, "loss": 1.4295, "mean_token_accuracy": 0.6430869400501251, "num_tokens": 2156878.0, "step": 35 }, { "entropy": 1.45703125, "epoch": 0.12984670874661858, "grad_norm": 0.11346353590488434, "learning_rate": 0.00048741007194244606, "loss": 1.3907, "mean_token_accuracy": 0.6510187089443207, "num_tokens": 2220072.0, "step": 36 }, { "entropy": 1.388671875, "epoch": 0.13345356176735798, "grad_norm": 0.09038154780864716, "learning_rate": 0.0004870503597122302, "loss": 1.3517, "mean_token_accuracy": 0.6563488245010376, "num_tokens": 2280579.0, "step": 37 }, { "entropy": 1.37109375, "epoch": 0.13706041478809738, "grad_norm": 0.0988394021987915, "learning_rate": 0.0004866906474820144, "loss": 1.3685, "mean_token_accuracy": 0.6536091417074203, "num_tokens": 2341392.0, "step": 38 }, { "entropy": 1.423828125, "epoch": 0.1406672678088368, "grad_norm": 0.09189973026514053, "learning_rate": 0.0004863309352517986, "loss": 1.3997, "mean_token_accuracy": 0.6478669345378876, "num_tokens": 2400688.0, "step": 39 }, { "entropy": 1.421875, "epoch": 0.1442741208295762, "grad_norm": 0.09040148556232452, "learning_rate": 0.00048597122302158273, "loss": 1.3877, "mean_token_accuracy": 0.6530875265598297, "num_tokens": 2462303.0, "step": 40 }, { "entropy": 1.435546875, "epoch": 0.1478809738503156, "grad_norm": 0.08843400329351425, "learning_rate": 0.0004856115107913669, "loss": 1.3918, "mean_token_accuracy": 0.6505153328180313, "num_tokens": 2523854.0, "step": 41 }, { "entropy": 1.419921875, "epoch": 0.151487826871055, "grad_norm": 0.0934394896030426, "learning_rate": 0.0004852517985611511, "loss": 1.3917, "mean_token_accuracy": 0.647098571062088, "num_tokens": 2585270.0, "step": 42 }, { "entropy": 1.447265625, "epoch": 0.1550946798917944, "grad_norm": 0.10170190036296844, "learning_rate": 0.0004848920863309353, "loss": 1.4069, "mean_token_accuracy": 0.6488664448261261, "num_tokens": 2648716.0, "step": 43 }, { "entropy": 1.412109375, "epoch": 0.1587015329125338, "grad_norm": 0.09372569620609283, "learning_rate": 0.00048453237410071945, "loss": 1.384, "mean_token_accuracy": 0.6492820531129837, "num_tokens": 2711812.0, "step": 44 }, { "entropy": 1.38671875, "epoch": 0.1623083859332732, "grad_norm": 0.08812659233808517, "learning_rate": 0.0004841726618705036, "loss": 1.3502, "mean_token_accuracy": 0.6552395075559616, "num_tokens": 2772989.0, "step": 45 }, { "entropy": 1.443359375, "epoch": 0.1659152389540126, "grad_norm": 0.1031329333782196, "learning_rate": 0.00048381294964028776, "loss": 1.3944, "mean_token_accuracy": 0.647624284029007, "num_tokens": 2833824.0, "step": 46 }, { "entropy": 1.400390625, "epoch": 0.16952209197475204, "grad_norm": 0.09903182834386826, "learning_rate": 0.00048345323741007197, "loss": 1.3615, "mean_token_accuracy": 0.6557374894618988, "num_tokens": 2895992.0, "step": 47 }, { "entropy": 1.38671875, "epoch": 0.17312894499549145, "grad_norm": 0.09109807759523392, "learning_rate": 0.0004830935251798561, "loss": 1.3628, "mean_token_accuracy": 0.6577312648296356, "num_tokens": 2956715.0, "step": 48 }, { "entropy": 1.423828125, "epoch": 0.17673579801623085, "grad_norm": 0.08922488987445831, "learning_rate": 0.00048273381294964033, "loss": 1.3914, "mean_token_accuracy": 0.6500654071569443, "num_tokens": 3018433.0, "step": 49 }, { "entropy": 1.3984375, "epoch": 0.18034265103697025, "grad_norm": 0.10995634645223618, "learning_rate": 0.00048237410071942443, "loss": 1.3615, "mean_token_accuracy": 0.6526202708482742, "num_tokens": 3079243.0, "step": 50 }, { "entropy": 1.44140625, "epoch": 0.18394950405770966, "grad_norm": 0.09530891478061676, "learning_rate": 0.00048201438848920864, "loss": 1.3976, "mean_token_accuracy": 0.6517702341079712, "num_tokens": 3139744.0, "step": 51 }, { "entropy": 1.37890625, "epoch": 0.18755635707844906, "grad_norm": 0.09168506413698196, "learning_rate": 0.00048165467625899285, "loss": 1.3507, "mean_token_accuracy": 0.6591605395078659, "num_tokens": 3203297.0, "step": 52 }, { "entropy": 1.435546875, "epoch": 0.19116321009918846, "grad_norm": 0.09041737020015717, "learning_rate": 0.000481294964028777, "loss": 1.3785, "mean_token_accuracy": 0.653384655714035, "num_tokens": 3264178.0, "step": 53 }, { "entropy": 1.41796875, "epoch": 0.19477006311992787, "grad_norm": 0.1037738174200058, "learning_rate": 0.00048093525179856116, "loss": 1.3729, "mean_token_accuracy": 0.6526838839054108, "num_tokens": 3325176.0, "step": 54 }, { "entropy": 1.384765625, "epoch": 0.19837691614066727, "grad_norm": 0.08338800817728043, "learning_rate": 0.0004805755395683453, "loss": 1.3551, "mean_token_accuracy": 0.6567304730415344, "num_tokens": 3387667.0, "step": 55 }, { "entropy": 1.359375, "epoch": 0.20198376916140667, "grad_norm": 0.10195907950401306, "learning_rate": 0.0004802158273381295, "loss": 1.353, "mean_token_accuracy": 0.6594225764274597, "num_tokens": 3449271.0, "step": 56 }, { "entropy": 1.361328125, "epoch": 0.20559062218214608, "grad_norm": 0.08948971331119537, "learning_rate": 0.0004798561151079137, "loss": 1.3463, "mean_token_accuracy": 0.6569480001926422, "num_tokens": 3510714.0, "step": 57 }, { "entropy": 1.41015625, "epoch": 0.20919747520288548, "grad_norm": 0.09201383590698242, "learning_rate": 0.0004794964028776979, "loss": 1.3592, "mean_token_accuracy": 0.6537477076053619, "num_tokens": 3573350.0, "step": 58 }, { "entropy": 1.4296875, "epoch": 0.21280432822362488, "grad_norm": 0.10949582606554031, "learning_rate": 0.000479136690647482, "loss": 1.3527, "mean_token_accuracy": 0.6561050117015839, "num_tokens": 3635259.0, "step": 59 }, { "entropy": 1.392578125, "epoch": 0.2164111812443643, "grad_norm": 0.09264568984508514, "learning_rate": 0.0004787769784172662, "loss": 1.3753, "mean_token_accuracy": 0.6493556350469589, "num_tokens": 3695384.0, "step": 60 }, { "entropy": 1.337890625, "epoch": 0.2200180342651037, "grad_norm": 0.12051510810852051, "learning_rate": 0.0004784172661870504, "loss": 1.3439, "mean_token_accuracy": 0.6585545241832733, "num_tokens": 3758066.0, "step": 61 }, { "entropy": 1.39453125, "epoch": 0.2236248872858431, "grad_norm": 0.09623441100120544, "learning_rate": 0.00047805755395683455, "loss": 1.363, "mean_token_accuracy": 0.6535609513521194, "num_tokens": 3818688.0, "step": 62 }, { "entropy": 1.40625, "epoch": 0.2272317403065825, "grad_norm": 0.11702005565166473, "learning_rate": 0.0004776978417266187, "loss": 1.3387, "mean_token_accuracy": 0.6610676646232605, "num_tokens": 3878732.0, "step": 63 }, { "entropy": 1.3671875, "epoch": 0.2308385933273219, "grad_norm": 0.10394135117530823, "learning_rate": 0.00047733812949640286, "loss": 1.3141, "mean_token_accuracy": 0.6657440066337585, "num_tokens": 3939430.0, "step": 64 }, { "entropy": 1.373046875, "epoch": 0.2344454463480613, "grad_norm": 0.11498216539621353, "learning_rate": 0.00047697841726618707, "loss": 1.3623, "mean_token_accuracy": 0.6541779190301895, "num_tokens": 4001344.0, "step": 65 }, { "entropy": 1.3984375, "epoch": 0.23805229936880073, "grad_norm": 0.09205884486436844, "learning_rate": 0.0004766187050359712, "loss": 1.3532, "mean_token_accuracy": 0.6557832509279251, "num_tokens": 4061387.0, "step": 66 }, { "entropy": 1.412109375, "epoch": 0.24165915238954014, "grad_norm": 0.11706861108541489, "learning_rate": 0.00047625899280575543, "loss": 1.3624, "mean_token_accuracy": 0.6543096750974655, "num_tokens": 4121966.0, "step": 67 }, { "entropy": 1.296875, "epoch": 0.24526600541027954, "grad_norm": 0.09137823432683945, "learning_rate": 0.0004758992805755396, "loss": 1.2719, "mean_token_accuracy": 0.6716153919696808, "num_tokens": 4183486.0, "step": 68 }, { "entropy": 1.3125, "epoch": 0.24887285843101895, "grad_norm": 0.14280101656913757, "learning_rate": 0.00047553956834532374, "loss": 1.3098, "mean_token_accuracy": 0.6644623428583145, "num_tokens": 4245779.0, "step": 69 }, { "entropy": 1.3828125, "epoch": 0.25247971145175835, "grad_norm": 0.0973227247595787, "learning_rate": 0.00047517985611510795, "loss": 1.3329, "mean_token_accuracy": 0.660480409860611, "num_tokens": 4306257.0, "step": 70 }, { "entropy": 1.400390625, "epoch": 0.2560865644724977, "grad_norm": 0.11058962345123291, "learning_rate": 0.0004748201438848921, "loss": 1.3508, "mean_token_accuracy": 0.6556032747030258, "num_tokens": 4367155.0, "step": 71 }, { "entropy": 1.33203125, "epoch": 0.25969341749323716, "grad_norm": 0.11350785195827484, "learning_rate": 0.00047446043165467626, "loss": 1.2935, "mean_token_accuracy": 0.6670258790254593, "num_tokens": 4429816.0, "step": 72 }, { "entropy": 1.369140625, "epoch": 0.26330027051397653, "grad_norm": 0.12396484613418579, "learning_rate": 0.0004741007194244604, "loss": 1.3822, "mean_token_accuracy": 0.6516790390014648, "num_tokens": 4491181.0, "step": 73 }, { "entropy": 1.330078125, "epoch": 0.26690712353471596, "grad_norm": 0.09880026429891586, "learning_rate": 0.0004737410071942446, "loss": 1.3086, "mean_token_accuracy": 0.6620372384786606, "num_tokens": 4552029.0, "step": 74 }, { "entropy": 1.376953125, "epoch": 0.27051397655545534, "grad_norm": 0.09168411046266556, "learning_rate": 0.0004733812949640288, "loss": 1.3147, "mean_token_accuracy": 0.6649867743253708, "num_tokens": 4613328.0, "step": 75 }, { "entropy": 1.40625, "epoch": 0.27412082957619477, "grad_norm": 0.10424616932868958, "learning_rate": 0.000473021582733813, "loss": 1.3392, "mean_token_accuracy": 0.6619475185871124, "num_tokens": 4674281.0, "step": 76 }, { "entropy": 1.361328125, "epoch": 0.2777276825969342, "grad_norm": 0.09615188837051392, "learning_rate": 0.00047266187050359714, "loss": 1.3179, "mean_token_accuracy": 0.6640816926956177, "num_tokens": 4735735.0, "step": 77 }, { "entropy": 1.369140625, "epoch": 0.2813345356176736, "grad_norm": 0.1020839661359787, "learning_rate": 0.0004723021582733813, "loss": 1.3599, "mean_token_accuracy": 0.6606272757053375, "num_tokens": 4797654.0, "step": 78 }, { "entropy": 1.373046875, "epoch": 0.284941388638413, "grad_norm": 0.10847945511341095, "learning_rate": 0.0004719424460431655, "loss": 1.336, "mean_token_accuracy": 0.6600470244884491, "num_tokens": 4860635.0, "step": 79 }, { "entropy": 1.3828125, "epoch": 0.2885482416591524, "grad_norm": 0.10152547061443329, "learning_rate": 0.00047158273381294965, "loss": 1.3197, "mean_token_accuracy": 0.6612180024385452, "num_tokens": 4922030.0, "step": 80 }, { "entropy": 1.349609375, "epoch": 0.2921550946798918, "grad_norm": 0.10306407511234283, "learning_rate": 0.00047122302158273386, "loss": 1.3187, "mean_token_accuracy": 0.6631215661764145, "num_tokens": 4982377.0, "step": 81 }, { "entropy": 1.302734375, "epoch": 0.2957619477006312, "grad_norm": 0.10228051245212555, "learning_rate": 0.00047086330935251796, "loss": 1.3083, "mean_token_accuracy": 0.6664199531078339, "num_tokens": 5045058.0, "step": 82 }, { "entropy": 1.33984375, "epoch": 0.2993688007213706, "grad_norm": 0.10302013903856277, "learning_rate": 0.00047050359712230217, "loss": 1.3357, "mean_token_accuracy": 0.6621494442224503, "num_tokens": 5105905.0, "step": 83 }, { "entropy": 1.392578125, "epoch": 0.30297565374211, "grad_norm": 0.09598083794116974, "learning_rate": 0.0004701438848920863, "loss": 1.3461, "mean_token_accuracy": 0.6572457700967789, "num_tokens": 5167597.0, "step": 84 }, { "entropy": 1.3984375, "epoch": 0.3065825067628494, "grad_norm": 0.10638178139925003, "learning_rate": 0.00046978417266187053, "loss": 1.3294, "mean_token_accuracy": 0.6597902178764343, "num_tokens": 5229270.0, "step": 85 }, { "entropy": 1.3515625, "epoch": 0.3101893597835888, "grad_norm": 0.0952412411570549, "learning_rate": 0.0004694244604316547, "loss": 1.3049, "mean_token_accuracy": 0.6617357581853867, "num_tokens": 5291138.0, "step": 86 }, { "entropy": 1.3046875, "epoch": 0.31379621280432823, "grad_norm": 0.09313120692968369, "learning_rate": 0.00046906474820143884, "loss": 1.2804, "mean_token_accuracy": 0.6725138425827026, "num_tokens": 5352298.0, "step": 87 }, { "entropy": 1.376953125, "epoch": 0.3174030658250676, "grad_norm": 0.10185771435499191, "learning_rate": 0.00046870503597122305, "loss": 1.3517, "mean_token_accuracy": 0.6558056026697159, "num_tokens": 5413742.0, "step": 88 }, { "entropy": 1.34765625, "epoch": 0.32100991884580704, "grad_norm": 0.10081026703119278, "learning_rate": 0.0004683453237410072, "loss": 1.3171, "mean_token_accuracy": 0.6634920239448547, "num_tokens": 5476096.0, "step": 89 }, { "entropy": 1.326171875, "epoch": 0.3246167718665464, "grad_norm": 0.09721463173627853, "learning_rate": 0.0004679856115107914, "loss": 1.3085, "mean_token_accuracy": 0.6626997590065002, "num_tokens": 5535661.0, "step": 90 }, { "entropy": 1.330078125, "epoch": 0.32822362488728585, "grad_norm": 0.09751971065998077, "learning_rate": 0.0004676258992805755, "loss": 1.3005, "mean_token_accuracy": 0.6645149141550064, "num_tokens": 5596979.0, "step": 91 }, { "entropy": 1.36328125, "epoch": 0.3318304779080252, "grad_norm": 0.09325600415468216, "learning_rate": 0.0004672661870503597, "loss": 1.3158, "mean_token_accuracy": 0.6647576689720154, "num_tokens": 5658914.0, "step": 92 }, { "entropy": 1.349609375, "epoch": 0.33543733092876465, "grad_norm": 0.10252340883016586, "learning_rate": 0.0004669064748201439, "loss": 1.3227, "mean_token_accuracy": 0.6605348736047745, "num_tokens": 5720582.0, "step": 93 }, { "entropy": 1.361328125, "epoch": 0.3390441839495041, "grad_norm": 0.09599050879478455, "learning_rate": 0.0004665467625899281, "loss": 1.3012, "mean_token_accuracy": 0.6683944463729858, "num_tokens": 5781040.0, "step": 94 }, { "entropy": 1.353515625, "epoch": 0.34265103697024346, "grad_norm": 0.10135883837938309, "learning_rate": 0.00046618705035971224, "loss": 1.3161, "mean_token_accuracy": 0.6641709208488464, "num_tokens": 5841990.0, "step": 95 }, { "entropy": 1.353515625, "epoch": 0.3462578899909829, "grad_norm": 0.10210946202278137, "learning_rate": 0.0004658273381294964, "loss": 1.3344, "mean_token_accuracy": 0.6593538820743561, "num_tokens": 5903024.0, "step": 96 }, { "entropy": 1.330078125, "epoch": 0.34986474301172227, "grad_norm": 0.09815412759780884, "learning_rate": 0.0004654676258992806, "loss": 1.2982, "mean_token_accuracy": 0.6672879159450531, "num_tokens": 5964325.0, "step": 97 }, { "entropy": 1.359375, "epoch": 0.3534715960324617, "grad_norm": 0.11357366293668747, "learning_rate": 0.00046510791366906475, "loss": 1.3511, "mean_token_accuracy": 0.6560703068971634, "num_tokens": 6024306.0, "step": 98 }, { "entropy": 1.373046875, "epoch": 0.3570784490532011, "grad_norm": 0.11732451617717743, "learning_rate": 0.00046474820143884896, "loss": 1.3307, "mean_token_accuracy": 0.6588790416717529, "num_tokens": 6084203.0, "step": 99 }, { "entropy": 1.380859375, "epoch": 0.3606853020739405, "grad_norm": 0.10406817495822906, "learning_rate": 0.00046438848920863306, "loss": 1.3308, "mean_token_accuracy": 0.6600909531116486, "num_tokens": 6147249.0, "step": 100 }, { "entropy": 1.359375, "epoch": 0.3642921550946799, "grad_norm": 0.11708933860063553, "learning_rate": 0.00046402877697841727, "loss": 1.3286, "mean_token_accuracy": 0.6616474837064743, "num_tokens": 6207963.0, "step": 101 }, { "entropy": 1.33203125, "epoch": 0.3678990081154193, "grad_norm": 0.09779378026723862, "learning_rate": 0.0004636690647482015, "loss": 1.289, "mean_token_accuracy": 0.6661412119865417, "num_tokens": 6268932.0, "step": 102 }, { "entropy": 1.4375, "epoch": 0.3715058611361587, "grad_norm": 0.1120646744966507, "learning_rate": 0.00046330935251798563, "loss": 1.3908, "mean_token_accuracy": 0.6479608565568924, "num_tokens": 6328355.0, "step": 103 }, { "entropy": 1.33203125, "epoch": 0.3751127141568981, "grad_norm": 0.11415059119462967, "learning_rate": 0.0004629496402877698, "loss": 1.2915, "mean_token_accuracy": 0.6661449372768402, "num_tokens": 6389770.0, "step": 104 }, { "entropy": 1.298828125, "epoch": 0.3787195671776375, "grad_norm": 0.10884244740009308, "learning_rate": 0.00046258992805755394, "loss": 1.2729, "mean_token_accuracy": 0.671757385134697, "num_tokens": 6449709.0, "step": 105 }, { "entropy": 1.33984375, "epoch": 0.3823264201983769, "grad_norm": 0.1008341908454895, "learning_rate": 0.00046223021582733815, "loss": 1.3102, "mean_token_accuracy": 0.6624729633331299, "num_tokens": 6510238.0, "step": 106 }, { "entropy": 1.306640625, "epoch": 0.3859332732191163, "grad_norm": 0.10561376810073853, "learning_rate": 0.0004618705035971223, "loss": 1.2878, "mean_token_accuracy": 0.6654368340969086, "num_tokens": 6571108.0, "step": 107 }, { "entropy": 1.345703125, "epoch": 0.38954012623985573, "grad_norm": 0.09814321994781494, "learning_rate": 0.0004615107913669065, "loss": 1.317, "mean_token_accuracy": 0.6615089625120163, "num_tokens": 6633545.0, "step": 108 }, { "entropy": 1.322265625, "epoch": 0.3931469792605951, "grad_norm": 0.11003340780735016, "learning_rate": 0.0004611510791366906, "loss": 1.2906, "mean_token_accuracy": 0.6662724167108536, "num_tokens": 6695569.0, "step": 109 }, { "entropy": 1.328125, "epoch": 0.39675383228133454, "grad_norm": 0.10055093467235565, "learning_rate": 0.0004607913669064748, "loss": 1.2801, "mean_token_accuracy": 0.671228438615799, "num_tokens": 6758035.0, "step": 110 }, { "entropy": 1.33203125, "epoch": 0.4003606853020739, "grad_norm": 0.10745132714509964, "learning_rate": 0.00046043165467625903, "loss": 1.2926, "mean_token_accuracy": 0.6671402305364609, "num_tokens": 6819543.0, "step": 111 }, { "entropy": 1.283203125, "epoch": 0.40396753832281335, "grad_norm": 0.10604678094387054, "learning_rate": 0.0004600719424460432, "loss": 1.2595, "mean_token_accuracy": 0.6724193096160889, "num_tokens": 6881830.0, "step": 112 }, { "entropy": 1.322265625, "epoch": 0.4075743913435528, "grad_norm": 0.10519695281982422, "learning_rate": 0.0004597122302158274, "loss": 1.2935, "mean_token_accuracy": 0.6699620187282562, "num_tokens": 6941538.0, "step": 113 }, { "entropy": 1.306640625, "epoch": 0.41118124436429215, "grad_norm": 0.1009632796049118, "learning_rate": 0.0004593525179856115, "loss": 1.2755, "mean_token_accuracy": 0.6686723083257675, "num_tokens": 7002824.0, "step": 114 }, { "entropy": 1.3671875, "epoch": 0.4147880973850316, "grad_norm": 0.10454940795898438, "learning_rate": 0.0004589928057553957, "loss": 1.3241, "mean_token_accuracy": 0.6622076481580734, "num_tokens": 7063154.0, "step": 115 }, { "entropy": 1.3515625, "epoch": 0.41839495040577096, "grad_norm": 0.11023333668708801, "learning_rate": 0.00045863309352517985, "loss": 1.3087, "mean_token_accuracy": 0.661966547369957, "num_tokens": 7125859.0, "step": 116 }, { "entropy": 1.26171875, "epoch": 0.4220018034265104, "grad_norm": 0.1078903079032898, "learning_rate": 0.00045827338129496406, "loss": 1.2394, "mean_token_accuracy": 0.6771416813135147, "num_tokens": 7187841.0, "step": 117 }, { "entropy": 1.2734375, "epoch": 0.42560865644724977, "grad_norm": 0.11535125225782394, "learning_rate": 0.00045791366906474816, "loss": 1.2788, "mean_token_accuracy": 0.6726923733949661, "num_tokens": 7248711.0, "step": 118 }, { "entropy": 1.369140625, "epoch": 0.4292155094679892, "grad_norm": 0.10514269024133682, "learning_rate": 0.00045755395683453237, "loss": 1.3365, "mean_token_accuracy": 0.6604026705026627, "num_tokens": 7309167.0, "step": 119 }, { "entropy": 1.369140625, "epoch": 0.4328223624887286, "grad_norm": 0.12499909847974777, "learning_rate": 0.0004571942446043166, "loss": 1.2883, "mean_token_accuracy": 0.6659924685955048, "num_tokens": 7370511.0, "step": 120 }, { "entropy": 1.392578125, "epoch": 0.436429215509468, "grad_norm": 0.10083161294460297, "learning_rate": 0.00045683453237410073, "loss": 1.3324, "mean_token_accuracy": 0.6608517318964005, "num_tokens": 7430420.0, "step": 121 }, { "entropy": 1.330078125, "epoch": 0.4400360685302074, "grad_norm": 0.10808581858873367, "learning_rate": 0.00045647482014388494, "loss": 1.3078, "mean_token_accuracy": 0.663190171122551, "num_tokens": 7490956.0, "step": 122 }, { "entropy": 1.345703125, "epoch": 0.4436429215509468, "grad_norm": 0.1062081828713417, "learning_rate": 0.00045611510791366904, "loss": 1.3246, "mean_token_accuracy": 0.6602690368890762, "num_tokens": 7553388.0, "step": 123 }, { "entropy": 1.314453125, "epoch": 0.4472497745716862, "grad_norm": 0.10667255520820618, "learning_rate": 0.00045575539568345325, "loss": 1.2867, "mean_token_accuracy": 0.6678967773914337, "num_tokens": 7615577.0, "step": 124 }, { "entropy": 1.33203125, "epoch": 0.4508566275924256, "grad_norm": 0.10114851593971252, "learning_rate": 0.0004553956834532374, "loss": 1.3021, "mean_token_accuracy": 0.6650511771440506, "num_tokens": 7676622.0, "step": 125 }, { "entropy": 1.3359375, "epoch": 0.454463480613165, "grad_norm": 0.10652799159288406, "learning_rate": 0.0004550359712230216, "loss": 1.3016, "mean_token_accuracy": 0.6651318669319153, "num_tokens": 7737022.0, "step": 126 }, { "entropy": 1.322265625, "epoch": 0.4580703336339044, "grad_norm": 0.0991833508014679, "learning_rate": 0.00045467625899280577, "loss": 1.2886, "mean_token_accuracy": 0.6671187281608582, "num_tokens": 7797413.0, "step": 127 }, { "entropy": 1.34375, "epoch": 0.4616771866546438, "grad_norm": 0.10918841511011124, "learning_rate": 0.0004543165467625899, "loss": 1.277, "mean_token_accuracy": 0.6704264134168625, "num_tokens": 7857281.0, "step": 128 }, { "entropy": 1.349609375, "epoch": 0.46528403967538323, "grad_norm": 0.09825213253498077, "learning_rate": 0.00045395683453237413, "loss": 1.3085, "mean_token_accuracy": 0.6656972467899323, "num_tokens": 7920629.0, "step": 129 }, { "entropy": 1.353515625, "epoch": 0.4688908926961226, "grad_norm": 0.10464514791965485, "learning_rate": 0.0004535971223021583, "loss": 1.3115, "mean_token_accuracy": 0.6640525758266449, "num_tokens": 7982092.0, "step": 130 }, { "entropy": 1.27734375, "epoch": 0.47249774571686204, "grad_norm": 0.11285334080457687, "learning_rate": 0.0004532374100719425, "loss": 1.2584, "mean_token_accuracy": 0.6705505102872849, "num_tokens": 8045122.0, "step": 131 }, { "entropy": 1.279296875, "epoch": 0.47610459873760147, "grad_norm": 0.12269479781389236, "learning_rate": 0.0004528776978417266, "loss": 1.2656, "mean_token_accuracy": 0.6705341041088104, "num_tokens": 8105679.0, "step": 132 }, { "entropy": 1.30078125, "epoch": 0.47971145175834085, "grad_norm": 0.11836866289377213, "learning_rate": 0.0004525179856115108, "loss": 1.2757, "mean_token_accuracy": 0.6688458025455475, "num_tokens": 8168049.0, "step": 133 }, { "entropy": 1.328125, "epoch": 0.4833183047790803, "grad_norm": 0.10456886142492294, "learning_rate": 0.00045215827338129495, "loss": 1.2815, "mean_token_accuracy": 0.6720596551895142, "num_tokens": 8230868.0, "step": 134 }, { "entropy": 1.31640625, "epoch": 0.48692515779981965, "grad_norm": 0.10691170394420624, "learning_rate": 0.00045179856115107916, "loss": 1.2567, "mean_token_accuracy": 0.673307329416275, "num_tokens": 8291423.0, "step": 135 }, { "entropy": 1.333984375, "epoch": 0.4905320108205591, "grad_norm": 0.12394890189170837, "learning_rate": 0.0004514388489208633, "loss": 1.2739, "mean_token_accuracy": 0.6744171977043152, "num_tokens": 8352778.0, "step": 136 }, { "entropy": 1.294921875, "epoch": 0.49413886384129846, "grad_norm": 0.10053418576717377, "learning_rate": 0.00045107913669064747, "loss": 1.2683, "mean_token_accuracy": 0.6731529384851456, "num_tokens": 8416331.0, "step": 137 }, { "entropy": 1.240234375, "epoch": 0.4977457168620379, "grad_norm": 0.10026815533638, "learning_rate": 0.0004507194244604317, "loss": 1.236, "mean_token_accuracy": 0.6783836036920547, "num_tokens": 8478471.0, "step": 138 }, { "entropy": 1.341796875, "epoch": 0.5013525698827773, "grad_norm": 0.1067981943488121, "learning_rate": 0.00045035971223021583, "loss": 1.3217, "mean_token_accuracy": 0.66177998483181, "num_tokens": 8539339.0, "step": 139 }, { "entropy": 1.310546875, "epoch": 0.5049594229035167, "grad_norm": 0.10105327516794205, "learning_rate": 0.00045000000000000004, "loss": 1.2858, "mean_token_accuracy": 0.6682990491390228, "num_tokens": 8602103.0, "step": 140 }, { "entropy": 1.3515625, "epoch": 0.5085662759242561, "grad_norm": 0.1087937206029892, "learning_rate": 0.00044964028776978414, "loss": 1.2982, "mean_token_accuracy": 0.666653111577034, "num_tokens": 8662070.0, "step": 141 }, { "entropy": 1.3125, "epoch": 0.5121731289449954, "grad_norm": 0.1158578172326088, "learning_rate": 0.00044928057553956835, "loss": 1.2583, "mean_token_accuracy": 0.6763561964035034, "num_tokens": 8721693.0, "step": 142 }, { "entropy": 1.291015625, "epoch": 0.5157799819657349, "grad_norm": 0.103072389960289, "learning_rate": 0.0004489208633093525, "loss": 1.2772, "mean_token_accuracy": 0.6729309260845184, "num_tokens": 8785498.0, "step": 143 }, { "entropy": 1.287109375, "epoch": 0.5193868349864743, "grad_norm": 0.10714124888181686, "learning_rate": 0.0004485611510791367, "loss": 1.272, "mean_token_accuracy": 0.6703799515962601, "num_tokens": 8847247.0, "step": 144 }, { "entropy": 1.298828125, "epoch": 0.5229936880072137, "grad_norm": 0.10518942028284073, "learning_rate": 0.0004482014388489209, "loss": 1.2715, "mean_token_accuracy": 0.6729042083024979, "num_tokens": 8908125.0, "step": 145 }, { "entropy": 1.330078125, "epoch": 0.5266005410279531, "grad_norm": 0.1153033971786499, "learning_rate": 0.000447841726618705, "loss": 1.2634, "mean_token_accuracy": 0.6737408488988876, "num_tokens": 8970456.0, "step": 146 }, { "entropy": 1.365234375, "epoch": 0.5302073940486925, "grad_norm": 0.11221577227115631, "learning_rate": 0.00044748201438848923, "loss": 1.3239, "mean_token_accuracy": 0.6609526425600052, "num_tokens": 9031783.0, "step": 147 }, { "entropy": 1.30859375, "epoch": 0.5338142470694319, "grad_norm": 0.11235193908214569, "learning_rate": 0.0004471223021582734, "loss": 1.2884, "mean_token_accuracy": 0.6696923077106476, "num_tokens": 9094270.0, "step": 148 }, { "entropy": 1.3046875, "epoch": 0.5374211000901713, "grad_norm": 0.10395680367946625, "learning_rate": 0.0004467625899280576, "loss": 1.2788, "mean_token_accuracy": 0.6721493750810623, "num_tokens": 9154654.0, "step": 149 }, { "entropy": 1.36328125, "epoch": 0.5410279531109107, "grad_norm": 0.10595784336328506, "learning_rate": 0.0004464028776978417, "loss": 1.3115, "mean_token_accuracy": 0.662454828619957, "num_tokens": 9214558.0, "step": 150 }, { "entropy": 1.310546875, "epoch": 0.5446348061316502, "grad_norm": 0.10546643286943436, "learning_rate": 0.0004460431654676259, "loss": 1.2509, "mean_token_accuracy": 0.6778619438409805, "num_tokens": 9275024.0, "step": 151 }, { "entropy": 1.2734375, "epoch": 0.5482416591523895, "grad_norm": 0.10420048236846924, "learning_rate": 0.0004456834532374101, "loss": 1.2422, "mean_token_accuracy": 0.6751627922058105, "num_tokens": 9334740.0, "step": 152 }, { "entropy": 1.26171875, "epoch": 0.5518485121731289, "grad_norm": 0.10868912190198898, "learning_rate": 0.00044532374100719426, "loss": 1.2629, "mean_token_accuracy": 0.6736222505569458, "num_tokens": 9395793.0, "step": 153 }, { "entropy": 1.3046875, "epoch": 0.5554553651938684, "grad_norm": 0.10851168632507324, "learning_rate": 0.00044496402877697847, "loss": 1.2886, "mean_token_accuracy": 0.665636345744133, "num_tokens": 9457546.0, "step": 154 }, { "entropy": 1.361328125, "epoch": 0.5590622182146078, "grad_norm": 0.11089115589857101, "learning_rate": 0.00044460431654676257, "loss": 1.3032, "mean_token_accuracy": 0.6642045229673386, "num_tokens": 9516338.0, "step": 155 }, { "entropy": 1.2890625, "epoch": 0.5626690712353472, "grad_norm": 0.12984700500965118, "learning_rate": 0.0004442446043165468, "loss": 1.2355, "mean_token_accuracy": 0.6764592975378036, "num_tokens": 9577259.0, "step": 156 }, { "entropy": 1.306640625, "epoch": 0.5662759242560865, "grad_norm": 0.10849939286708832, "learning_rate": 0.00044388489208633093, "loss": 1.2857, "mean_token_accuracy": 0.668328195810318, "num_tokens": 9637350.0, "step": 157 }, { "entropy": 1.25, "epoch": 0.569882777276826, "grad_norm": 0.12273920327425003, "learning_rate": 0.00044352517985611514, "loss": 1.2593, "mean_token_accuracy": 0.673890545964241, "num_tokens": 9700423.0, "step": 158 }, { "entropy": 1.33203125, "epoch": 0.5734896302975654, "grad_norm": 0.11092253029346466, "learning_rate": 0.00044316546762589924, "loss": 1.3047, "mean_token_accuracy": 0.6693979352712631, "num_tokens": 9761931.0, "step": 159 }, { "entropy": 1.349609375, "epoch": 0.5770964833183048, "grad_norm": 0.10293691605329514, "learning_rate": 0.00044280575539568345, "loss": 1.2911, "mean_token_accuracy": 0.6668676137924194, "num_tokens": 9824633.0, "step": 160 }, { "entropy": 1.306640625, "epoch": 0.5807033363390441, "grad_norm": 0.11410611122846603, "learning_rate": 0.00044244604316546766, "loss": 1.2424, "mean_token_accuracy": 0.6762041449546814, "num_tokens": 9886261.0, "step": 161 }, { "entropy": 1.24609375, "epoch": 0.5843101893597836, "grad_norm": 0.11119683086872101, "learning_rate": 0.0004420863309352518, "loss": 1.199, "mean_token_accuracy": 0.6824366599321365, "num_tokens": 9947557.0, "step": 162 }, { "entropy": 1.251953125, "epoch": 0.587917042380523, "grad_norm": 0.11349198967218399, "learning_rate": 0.000441726618705036, "loss": 1.2484, "mean_token_accuracy": 0.6744900643825531, "num_tokens": 10010152.0, "step": 163 }, { "entropy": 1.2578125, "epoch": 0.5915238954012624, "grad_norm": 0.1155005395412445, "learning_rate": 0.0004413669064748201, "loss": 1.2618, "mean_token_accuracy": 0.6732602715492249, "num_tokens": 10072356.0, "step": 164 }, { "entropy": 1.2734375, "epoch": 0.5951307484220018, "grad_norm": 0.10886438935995102, "learning_rate": 0.00044100719424460433, "loss": 1.2476, "mean_token_accuracy": 0.6733746379613876, "num_tokens": 10133572.0, "step": 165 }, { "entropy": 1.3203125, "epoch": 0.5987376014427412, "grad_norm": 0.10490305721759796, "learning_rate": 0.0004406474820143885, "loss": 1.2597, "mean_token_accuracy": 0.6712180376052856, "num_tokens": 10195202.0, "step": 166 }, { "entropy": 1.337890625, "epoch": 0.6023444544634806, "grad_norm": 0.10198364406824112, "learning_rate": 0.0004402877697841727, "loss": 1.2759, "mean_token_accuracy": 0.6736036986112595, "num_tokens": 10256348.0, "step": 167 }, { "entropy": 1.330078125, "epoch": 0.60595130748422, "grad_norm": 0.11529339104890823, "learning_rate": 0.0004399280575539568, "loss": 1.3059, "mean_token_accuracy": 0.6684508770704269, "num_tokens": 10316224.0, "step": 168 }, { "entropy": 1.263671875, "epoch": 0.6095581605049594, "grad_norm": 0.10373765975236893, "learning_rate": 0.000439568345323741, "loss": 1.2238, "mean_token_accuracy": 0.6790874004364014, "num_tokens": 10378237.0, "step": 169 }, { "entropy": 1.3125, "epoch": 0.6131650135256989, "grad_norm": 0.10439463704824448, "learning_rate": 0.0004392086330935252, "loss": 1.2815, "mean_token_accuracy": 0.668475329875946, "num_tokens": 10438808.0, "step": 170 }, { "entropy": 1.28125, "epoch": 0.6167718665464382, "grad_norm": 0.10318120568990707, "learning_rate": 0.00043884892086330936, "loss": 1.2472, "mean_token_accuracy": 0.6763654053211212, "num_tokens": 10499745.0, "step": 171 }, { "entropy": 1.265625, "epoch": 0.6203787195671776, "grad_norm": 0.10287714004516602, "learning_rate": 0.00043848920863309357, "loss": 1.2373, "mean_token_accuracy": 0.6803200244903564, "num_tokens": 10562896.0, "step": 172 }, { "entropy": 1.294921875, "epoch": 0.6239855725879171, "grad_norm": 0.11654017120599747, "learning_rate": 0.00043812949640287767, "loss": 1.2641, "mean_token_accuracy": 0.6731261312961578, "num_tokens": 10625508.0, "step": 173 }, { "entropy": 1.31640625, "epoch": 0.6275924256086565, "grad_norm": 0.10752756893634796, "learning_rate": 0.0004377697841726619, "loss": 1.2854, "mean_token_accuracy": 0.6672516316175461, "num_tokens": 10687528.0, "step": 174 }, { "entropy": 1.322265625, "epoch": 0.6311992786293958, "grad_norm": 0.10769953578710556, "learning_rate": 0.00043741007194244603, "loss": 1.2858, "mean_token_accuracy": 0.6636885404586792, "num_tokens": 10746945.0, "step": 175 }, { "entropy": 1.279296875, "epoch": 0.6348061316501352, "grad_norm": 0.1083042174577713, "learning_rate": 0.00043705035971223024, "loss": 1.2494, "mean_token_accuracy": 0.6755568981170654, "num_tokens": 10809106.0, "step": 176 }, { "entropy": 1.33203125, "epoch": 0.6384129846708747, "grad_norm": 0.10725415498018265, "learning_rate": 0.00043669064748201445, "loss": 1.2937, "mean_token_accuracy": 0.6635169833898544, "num_tokens": 10871650.0, "step": 177 }, { "entropy": 1.251953125, "epoch": 0.6420198376916141, "grad_norm": 0.1022438332438469, "learning_rate": 0.00043633093525179855, "loss": 1.2238, "mean_token_accuracy": 0.6777883321046829, "num_tokens": 10933760.0, "step": 178 }, { "entropy": 1.302734375, "epoch": 0.6456266907123535, "grad_norm": 0.1344289928674698, "learning_rate": 0.00043597122302158276, "loss": 1.274, "mean_token_accuracy": 0.6707887947559357, "num_tokens": 10994803.0, "step": 179 }, { "entropy": 1.3046875, "epoch": 0.6492335437330928, "grad_norm": 0.12509560585021973, "learning_rate": 0.0004356115107913669, "loss": 1.2718, "mean_token_accuracy": 0.6688745021820068, "num_tokens": 11056749.0, "step": 180 }, { "entropy": 1.314453125, "epoch": 0.6528403967538323, "grad_norm": 0.10664302855730057, "learning_rate": 0.0004352517985611511, "loss": 1.2637, "mean_token_accuracy": 0.6712820082902908, "num_tokens": 11118871.0, "step": 181 }, { "entropy": 1.2734375, "epoch": 0.6564472497745717, "grad_norm": 0.1193428486585617, "learning_rate": 0.0004348920863309352, "loss": 1.2408, "mean_token_accuracy": 0.6793081015348434, "num_tokens": 11180512.0, "step": 182 }, { "entropy": 1.255859375, "epoch": 0.6600541027953111, "grad_norm": 0.10876548290252686, "learning_rate": 0.00043453237410071943, "loss": 1.2282, "mean_token_accuracy": 0.6800314038991928, "num_tokens": 11242101.0, "step": 183 }, { "entropy": 1.2578125, "epoch": 0.6636609558160504, "grad_norm": 0.1118224710226059, "learning_rate": 0.0004341726618705036, "loss": 1.249, "mean_token_accuracy": 0.6716993898153305, "num_tokens": 11305083.0, "step": 184 }, { "entropy": 1.294921875, "epoch": 0.6672678088367899, "grad_norm": 0.11248785257339478, "learning_rate": 0.0004338129496402878, "loss": 1.2829, "mean_token_accuracy": 0.6704527139663696, "num_tokens": 11366468.0, "step": 185 }, { "entropy": 1.27734375, "epoch": 0.6708746618575293, "grad_norm": 0.10844177007675171, "learning_rate": 0.000433453237410072, "loss": 1.2462, "mean_token_accuracy": 0.6777553111314774, "num_tokens": 11427387.0, "step": 186 }, { "entropy": 1.318359375, "epoch": 0.6744815148782687, "grad_norm": 0.10542786121368408, "learning_rate": 0.0004330935251798561, "loss": 1.2558, "mean_token_accuracy": 0.6754523366689682, "num_tokens": 11488182.0, "step": 187 }, { "entropy": 1.27734375, "epoch": 0.6780883678990082, "grad_norm": 0.10616500675678253, "learning_rate": 0.0004327338129496403, "loss": 1.2154, "mean_token_accuracy": 0.6861670315265656, "num_tokens": 11549489.0, "step": 188 }, { "entropy": 1.248046875, "epoch": 0.6816952209197475, "grad_norm": 0.11189056187868118, "learning_rate": 0.00043237410071942446, "loss": 1.2237, "mean_token_accuracy": 0.6808458119630814, "num_tokens": 11610110.0, "step": 189 }, { "entropy": 1.236328125, "epoch": 0.6853020739404869, "grad_norm": 0.1094244197010994, "learning_rate": 0.00043201438848920867, "loss": 1.22, "mean_token_accuracy": 0.6821918785572052, "num_tokens": 11671350.0, "step": 190 }, { "entropy": 1.2265625, "epoch": 0.6889089269612263, "grad_norm": 0.11222892999649048, "learning_rate": 0.00043165467625899277, "loss": 1.2306, "mean_token_accuracy": 0.6772794872522354, "num_tokens": 11732380.0, "step": 191 }, { "entropy": 1.2578125, "epoch": 0.6925157799819658, "grad_norm": 0.11168543994426727, "learning_rate": 0.000431294964028777, "loss": 1.2448, "mean_token_accuracy": 0.6753189563751221, "num_tokens": 11793281.0, "step": 192 }, { "entropy": 1.326171875, "epoch": 0.6961226330027052, "grad_norm": 0.10435453057289124, "learning_rate": 0.00043093525179856113, "loss": 1.2567, "mean_token_accuracy": 0.67137511074543, "num_tokens": 11855084.0, "step": 193 }, { "entropy": 1.3125, "epoch": 0.6997294860234445, "grad_norm": 0.11164921522140503, "learning_rate": 0.00043057553956834534, "loss": 1.2333, "mean_token_accuracy": 0.6790268570184708, "num_tokens": 11916518.0, "step": 194 }, { "entropy": 1.30078125, "epoch": 0.7033363390441839, "grad_norm": 0.1051032766699791, "learning_rate": 0.00043021582733812955, "loss": 1.2717, "mean_token_accuracy": 0.6721814721822739, "num_tokens": 11976480.0, "step": 195 }, { "entropy": 1.28125, "epoch": 0.7069431920649234, "grad_norm": 0.1090792790055275, "learning_rate": 0.00042985611510791365, "loss": 1.2631, "mean_token_accuracy": 0.6706659644842148, "num_tokens": 12038333.0, "step": 196 }, { "entropy": 1.23828125, "epoch": 0.7105500450856628, "grad_norm": 0.11211685091257095, "learning_rate": 0.00042949640287769786, "loss": 1.2318, "mean_token_accuracy": 0.6776062697172165, "num_tokens": 12099366.0, "step": 197 }, { "entropy": 1.2578125, "epoch": 0.7141568981064021, "grad_norm": 0.10964176803827286, "learning_rate": 0.000429136690647482, "loss": 1.2436, "mean_token_accuracy": 0.6773542612791061, "num_tokens": 12159264.0, "step": 198 }, { "entropy": 1.296875, "epoch": 0.7177637511271415, "grad_norm": 0.12643007934093475, "learning_rate": 0.0004287769784172662, "loss": 1.2454, "mean_token_accuracy": 0.6743884831666946, "num_tokens": 12221497.0, "step": 199 }, { "entropy": 1.2578125, "epoch": 0.721370604147881, "grad_norm": 0.10725845396518707, "learning_rate": 0.0004284172661870503, "loss": 1.215, "mean_token_accuracy": 0.6816920936107635, "num_tokens": 12281306.0, "step": 200 }, { "entropy": 1.248046875, "epoch": 0.7249774571686204, "grad_norm": 0.10579068958759308, "learning_rate": 0.00042805755395683453, "loss": 1.2133, "mean_token_accuracy": 0.6807700097560883, "num_tokens": 12342293.0, "step": 201 }, { "entropy": 1.26953125, "epoch": 0.7285843101893598, "grad_norm": 0.11594922095537186, "learning_rate": 0.00042769784172661874, "loss": 1.2628, "mean_token_accuracy": 0.6729366332292557, "num_tokens": 12403686.0, "step": 202 }, { "entropy": 1.265625, "epoch": 0.7321911632100991, "grad_norm": 0.10861968249082565, "learning_rate": 0.0004273381294964029, "loss": 1.2322, "mean_token_accuracy": 0.6770869344472885, "num_tokens": 12466237.0, "step": 203 }, { "entropy": 1.3203125, "epoch": 0.7357980162308386, "grad_norm": 0.10953235626220703, "learning_rate": 0.0004269784172661871, "loss": 1.2711, "mean_token_accuracy": 0.6650150716304779, "num_tokens": 12524658.0, "step": 204 }, { "entropy": 1.26171875, "epoch": 0.739404869251578, "grad_norm": 0.11514332890510559, "learning_rate": 0.0004266187050359712, "loss": 1.21, "mean_token_accuracy": 0.6824623197317123, "num_tokens": 12586817.0, "step": 205 }, { "entropy": 1.25390625, "epoch": 0.7430117222723174, "grad_norm": 0.10787420719861984, "learning_rate": 0.0004262589928057554, "loss": 1.2339, "mean_token_accuracy": 0.676334947347641, "num_tokens": 12649259.0, "step": 206 }, { "entropy": 1.25390625, "epoch": 0.7466185752930569, "grad_norm": 0.1065104752779007, "learning_rate": 0.00042589928057553956, "loss": 1.2196, "mean_token_accuracy": 0.6816098242998123, "num_tokens": 12711376.0, "step": 207 }, { "entropy": 1.244140625, "epoch": 0.7502254283137962, "grad_norm": 0.10898435115814209, "learning_rate": 0.00042553956834532377, "loss": 1.2039, "mean_token_accuracy": 0.6833729594945908, "num_tokens": 12773138.0, "step": 208 }, { "entropy": 1.30078125, "epoch": 0.7538322813345356, "grad_norm": 0.11270162463188171, "learning_rate": 0.00042517985611510787, "loss": 1.2691, "mean_token_accuracy": 0.673949122428894, "num_tokens": 12836347.0, "step": 209 }, { "entropy": 1.27734375, "epoch": 0.757439134355275, "grad_norm": 0.10765185952186584, "learning_rate": 0.0004248201438848921, "loss": 1.2378, "mean_token_accuracy": 0.6757946312427521, "num_tokens": 12897745.0, "step": 210 }, { "entropy": 1.240234375, "epoch": 0.7610459873760145, "grad_norm": 0.10766444355249405, "learning_rate": 0.0004244604316546763, "loss": 1.2035, "mean_token_accuracy": 0.6856610029935837, "num_tokens": 12959108.0, "step": 211 }, { "entropy": 1.236328125, "epoch": 0.7646528403967539, "grad_norm": 0.10983278602361679, "learning_rate": 0.00042410071942446044, "loss": 1.2105, "mean_token_accuracy": 0.6832093745470047, "num_tokens": 13019142.0, "step": 212 }, { "entropy": 1.2109375, "epoch": 0.7682596934174932, "grad_norm": 0.11807787418365479, "learning_rate": 0.00042374100719424465, "loss": 1.2084, "mean_token_accuracy": 0.6817581653594971, "num_tokens": 13079375.0, "step": 213 }, { "entropy": 1.228515625, "epoch": 0.7718665464382326, "grad_norm": 0.10261593014001846, "learning_rate": 0.00042338129496402875, "loss": 1.211, "mean_token_accuracy": 0.6827448457479477, "num_tokens": 13140109.0, "step": 214 }, { "entropy": 1.306640625, "epoch": 0.7754733994589721, "grad_norm": 0.11636381596326828, "learning_rate": 0.00042302158273381296, "loss": 1.266, "mean_token_accuracy": 0.6709317713975906, "num_tokens": 13201053.0, "step": 215 }, { "entropy": 1.3203125, "epoch": 0.7790802524797115, "grad_norm": 0.1099143996834755, "learning_rate": 0.0004226618705035971, "loss": 1.262, "mean_token_accuracy": 0.6734907180070877, "num_tokens": 13262964.0, "step": 216 }, { "entropy": 1.25390625, "epoch": 0.7826871055004508, "grad_norm": 0.11104336380958557, "learning_rate": 0.0004223021582733813, "loss": 1.1933, "mean_token_accuracy": 0.683800145983696, "num_tokens": 13324702.0, "step": 217 }, { "entropy": 1.26953125, "epoch": 0.7862939585211902, "grad_norm": 0.11280883848667145, "learning_rate": 0.0004219424460431655, "loss": 1.2438, "mean_token_accuracy": 0.6764675676822662, "num_tokens": 13386378.0, "step": 218 }, { "entropy": 1.244140625, "epoch": 0.7899008115419297, "grad_norm": 0.10563644021749496, "learning_rate": 0.00042158273381294963, "loss": 1.2219, "mean_token_accuracy": 0.6784726232290268, "num_tokens": 13447874.0, "step": 219 }, { "entropy": 1.17578125, "epoch": 0.7935076645626691, "grad_norm": 0.11429610103368759, "learning_rate": 0.00042122302158273384, "loss": 1.1652, "mean_token_accuracy": 0.6897690445184708, "num_tokens": 13509068.0, "step": 220 }, { "entropy": 1.23046875, "epoch": 0.7971145175834085, "grad_norm": 0.11500820517539978, "learning_rate": 0.000420863309352518, "loss": 1.2057, "mean_token_accuracy": 0.6824591606855392, "num_tokens": 13569750.0, "step": 221 }, { "entropy": 1.236328125, "epoch": 0.8007213706041478, "grad_norm": 0.11324937641620636, "learning_rate": 0.0004205035971223022, "loss": 1.2152, "mean_token_accuracy": 0.6826305240392685, "num_tokens": 13632126.0, "step": 222 }, { "entropy": 1.234375, "epoch": 0.8043282236248873, "grad_norm": 0.11634687334299088, "learning_rate": 0.0004201438848920863, "loss": 1.1925, "mean_token_accuracy": 0.6828910708427429, "num_tokens": 13694188.0, "step": 223 }, { "entropy": 1.298828125, "epoch": 0.8079350766456267, "grad_norm": 0.1130850538611412, "learning_rate": 0.0004197841726618705, "loss": 1.2318, "mean_token_accuracy": 0.6769719272851944, "num_tokens": 13755039.0, "step": 224 }, { "entropy": 1.28515625, "epoch": 0.8115419296663661, "grad_norm": 0.11476517468690872, "learning_rate": 0.00041942446043165466, "loss": 1.232, "mean_token_accuracy": 0.6818104982376099, "num_tokens": 13816500.0, "step": 225 }, { "entropy": 1.267578125, "epoch": 0.8151487826871056, "grad_norm": 0.11381266266107559, "learning_rate": 0.00041906474820143887, "loss": 1.2356, "mean_token_accuracy": 0.6754339188337326, "num_tokens": 13876599.0, "step": 226 }, { "entropy": 1.185546875, "epoch": 0.8187556357078449, "grad_norm": 0.12813608348369598, "learning_rate": 0.000418705035971223, "loss": 1.1976, "mean_token_accuracy": 0.684471607208252, "num_tokens": 13938464.0, "step": 227 }, { "entropy": 1.208984375, "epoch": 0.8223624887285843, "grad_norm": 0.11226394027471542, "learning_rate": 0.0004183453237410072, "loss": 1.2117, "mean_token_accuracy": 0.6837659180164337, "num_tokens": 14001459.0, "step": 228 }, { "entropy": 1.263671875, "epoch": 0.8259693417493237, "grad_norm": 0.10709498822689056, "learning_rate": 0.0004179856115107914, "loss": 1.2257, "mean_token_accuracy": 0.6789108663797379, "num_tokens": 14064697.0, "step": 229 }, { "entropy": 1.29296875, "epoch": 0.8295761947700632, "grad_norm": 0.11449526995420456, "learning_rate": 0.00041762589928057554, "loss": 1.2334, "mean_token_accuracy": 0.6803536713123322, "num_tokens": 14126126.0, "step": 230 }, { "entropy": 1.3125, "epoch": 0.8331830477908025, "grad_norm": 0.1105601042509079, "learning_rate": 0.00041726618705035975, "loss": 1.2453, "mean_token_accuracy": 0.6745648831129074, "num_tokens": 14186104.0, "step": 231 }, { "entropy": 1.240234375, "epoch": 0.8367899008115419, "grad_norm": 0.10903415828943253, "learning_rate": 0.00041690647482014385, "loss": 1.2126, "mean_token_accuracy": 0.6840721815824509, "num_tokens": 14248453.0, "step": 232 }, { "entropy": 1.236328125, "epoch": 0.8403967538322813, "grad_norm": 0.1116284653544426, "learning_rate": 0.00041654676258992806, "loss": 1.2169, "mean_token_accuracy": 0.6794636696577072, "num_tokens": 14309644.0, "step": 233 }, { "entropy": 1.2421875, "epoch": 0.8440036068530208, "grad_norm": 0.1086789146065712, "learning_rate": 0.0004161870503597122, "loss": 1.2262, "mean_token_accuracy": 0.6806915700435638, "num_tokens": 14370724.0, "step": 234 }, { "entropy": 1.234375, "epoch": 0.8476104598737602, "grad_norm": 0.10979616641998291, "learning_rate": 0.0004158273381294964, "loss": 1.2087, "mean_token_accuracy": 0.6803796291351318, "num_tokens": 14431600.0, "step": 235 }, { "entropy": 1.265625, "epoch": 0.8512173128944995, "grad_norm": 0.11608376353979111, "learning_rate": 0.00041546762589928063, "loss": 1.2242, "mean_token_accuracy": 0.677669420838356, "num_tokens": 14493993.0, "step": 236 }, { "entropy": 1.255859375, "epoch": 0.8548241659152389, "grad_norm": 0.10973092168569565, "learning_rate": 0.00041510791366906473, "loss": 1.2224, "mean_token_accuracy": 0.680708721280098, "num_tokens": 14555648.0, "step": 237 }, { "entropy": 1.287109375, "epoch": 0.8584310189359784, "grad_norm": 0.11958243697881699, "learning_rate": 0.00041474820143884894, "loss": 1.2291, "mean_token_accuracy": 0.6781415045261383, "num_tokens": 14616099.0, "step": 238 }, { "entropy": 1.265625, "epoch": 0.8620378719567178, "grad_norm": 0.1115846112370491, "learning_rate": 0.0004143884892086331, "loss": 1.2211, "mean_token_accuracy": 0.6795002520084381, "num_tokens": 14677084.0, "step": 239 }, { "entropy": 1.287109375, "epoch": 0.8656447249774571, "grad_norm": 0.11387167125940323, "learning_rate": 0.0004140287769784173, "loss": 1.2587, "mean_token_accuracy": 0.6729397773742676, "num_tokens": 14738815.0, "step": 240 }, { "entropy": 1.23046875, "epoch": 0.8692515779981965, "grad_norm": 0.11470820009708405, "learning_rate": 0.00041366906474820146, "loss": 1.2107, "mean_token_accuracy": 0.6803193092346191, "num_tokens": 14798880.0, "step": 241 }, { "entropy": 1.22265625, "epoch": 0.872858431018936, "grad_norm": 0.11122211068868637, "learning_rate": 0.0004133093525179856, "loss": 1.221, "mean_token_accuracy": 0.6819735169410706, "num_tokens": 14860893.0, "step": 242 }, { "entropy": 1.234375, "epoch": 0.8764652840396754, "grad_norm": 0.11543437838554382, "learning_rate": 0.00041294964028776976, "loss": 1.2361, "mean_token_accuracy": 0.6749699413776398, "num_tokens": 14921105.0, "step": 243 }, { "entropy": 1.26171875, "epoch": 0.8800721370604148, "grad_norm": 0.10818582773208618, "learning_rate": 0.00041258992805755397, "loss": 1.2045, "mean_token_accuracy": 0.6836854219436646, "num_tokens": 14982068.0, "step": 244 }, { "entropy": 1.322265625, "epoch": 0.8836789900811542, "grad_norm": 0.11246918141841888, "learning_rate": 0.0004122302158273382, "loss": 1.2764, "mean_token_accuracy": 0.6691164523363113, "num_tokens": 15039831.0, "step": 245 }, { "entropy": 1.306640625, "epoch": 0.8872858431018936, "grad_norm": 0.11163143068552017, "learning_rate": 0.0004118705035971223, "loss": 1.2476, "mean_token_accuracy": 0.6756374686956406, "num_tokens": 15101849.0, "step": 246 }, { "entropy": 1.279296875, "epoch": 0.890892696122633, "grad_norm": 0.10882314294576645, "learning_rate": 0.0004115107913669065, "loss": 1.2254, "mean_token_accuracy": 0.6769590675830841, "num_tokens": 15164375.0, "step": 247 }, { "entropy": 1.234375, "epoch": 0.8944995491433724, "grad_norm": 0.11400745809078217, "learning_rate": 0.00041115107913669064, "loss": 1.2075, "mean_token_accuracy": 0.6808744519948959, "num_tokens": 15224258.0, "step": 248 }, { "entropy": 1.205078125, "epoch": 0.8981064021641119, "grad_norm": 0.11705787479877472, "learning_rate": 0.00041079136690647485, "loss": 1.2019, "mean_token_accuracy": 0.6836320012807846, "num_tokens": 15285538.0, "step": 249 }, { "entropy": 1.169921875, "epoch": 0.9017132551848512, "grad_norm": 0.11530110985040665, "learning_rate": 0.000410431654676259, "loss": 1.1355, "mean_token_accuracy": 0.6963971108198166, "num_tokens": 15346677.0, "step": 250 }, { "entropy": 1.25390625, "epoch": 0.9053201082055906, "grad_norm": 0.12153225392103195, "learning_rate": 0.00041007194244604316, "loss": 1.2214, "mean_token_accuracy": 0.6810008138418198, "num_tokens": 15406726.0, "step": 251 }, { "entropy": 1.24609375, "epoch": 0.90892696122633, "grad_norm": 0.11004765331745148, "learning_rate": 0.0004097122302158273, "loss": 1.214, "mean_token_accuracy": 0.6827515214681625, "num_tokens": 15469695.0, "step": 252 }, { "entropy": 1.265625, "epoch": 0.9125338142470695, "grad_norm": 0.1144164502620697, "learning_rate": 0.0004093525179856115, "loss": 1.2487, "mean_token_accuracy": 0.6756491959095001, "num_tokens": 15530984.0, "step": 253 }, { "entropy": 1.2890625, "epoch": 0.9161406672678089, "grad_norm": 0.11565149575471878, "learning_rate": 0.00040899280575539573, "loss": 1.2531, "mean_token_accuracy": 0.67392498254776, "num_tokens": 15591776.0, "step": 254 }, { "entropy": 1.236328125, "epoch": 0.9197475202885482, "grad_norm": 0.12919116020202637, "learning_rate": 0.00040863309352517983, "loss": 1.1831, "mean_token_accuracy": 0.6865502595901489, "num_tokens": 15652441.0, "step": 255 }, { "entropy": 1.1953125, "epoch": 0.9233543733092876, "grad_norm": 0.11027511209249496, "learning_rate": 0.00040827338129496404, "loss": 1.1613, "mean_token_accuracy": 0.6939192414283752, "num_tokens": 15714054.0, "step": 256 }, { "entropy": 1.263671875, "epoch": 0.9269612263300271, "grad_norm": 0.11752895265817642, "learning_rate": 0.0004079136690647482, "loss": 1.2095, "mean_token_accuracy": 0.6831164509057999, "num_tokens": 15774639.0, "step": 257 }, { "entropy": 1.271484375, "epoch": 0.9305680793507665, "grad_norm": 0.133804589509964, "learning_rate": 0.0004075539568345324, "loss": 1.263, "mean_token_accuracy": 0.670431062579155, "num_tokens": 15834786.0, "step": 258 }, { "entropy": 1.205078125, "epoch": 0.9341749323715058, "grad_norm": 0.10909534990787506, "learning_rate": 0.00040719424460431656, "loss": 1.1817, "mean_token_accuracy": 0.6886219680309296, "num_tokens": 15894699.0, "step": 259 }, { "entropy": 1.296875, "epoch": 0.9377817853922452, "grad_norm": 0.11968094855546951, "learning_rate": 0.0004068345323741007, "loss": 1.2664, "mean_token_accuracy": 0.6717170029878616, "num_tokens": 15956334.0, "step": 260 }, { "entropy": 1.259765625, "epoch": 0.9413886384129847, "grad_norm": 0.10848213732242584, "learning_rate": 0.0004064748201438849, "loss": 1.2184, "mean_token_accuracy": 0.6822212189435959, "num_tokens": 16018310.0, "step": 261 }, { "entropy": 1.23046875, "epoch": 0.9449954914337241, "grad_norm": 0.11788785457611084, "learning_rate": 0.0004061151079136691, "loss": 1.2057, "mean_token_accuracy": 0.6808347553014755, "num_tokens": 16080252.0, "step": 262 }, { "entropy": 1.265625, "epoch": 0.9486023444544635, "grad_norm": 0.1199130117893219, "learning_rate": 0.0004057553956834533, "loss": 1.2326, "mean_token_accuracy": 0.6789408773183823, "num_tokens": 16142021.0, "step": 263 }, { "entropy": 1.310546875, "epoch": 0.9522091974752029, "grad_norm": 0.1160711795091629, "learning_rate": 0.0004053956834532374, "loss": 1.2741, "mean_token_accuracy": 0.6713822036981583, "num_tokens": 16204373.0, "step": 264 }, { "entropy": 1.267578125, "epoch": 0.9558160504959423, "grad_norm": 0.12317435443401337, "learning_rate": 0.0004050359712230216, "loss": 1.2079, "mean_token_accuracy": 0.6817189157009125, "num_tokens": 16266051.0, "step": 265 }, { "entropy": 1.248046875, "epoch": 0.9594229035166817, "grad_norm": 0.11181028187274933, "learning_rate": 0.00040467625899280574, "loss": 1.2195, "mean_token_accuracy": 0.6788323819637299, "num_tokens": 16329322.0, "step": 266 }, { "entropy": 1.28125, "epoch": 0.9630297565374211, "grad_norm": 0.11690454930067062, "learning_rate": 0.00040431654676258995, "loss": 1.2403, "mean_token_accuracy": 0.6766374111175537, "num_tokens": 16390423.0, "step": 267 }, { "entropy": 1.20703125, "epoch": 0.9666366095581606, "grad_norm": 0.11905571073293686, "learning_rate": 0.0004039568345323741, "loss": 1.1798, "mean_token_accuracy": 0.6859436184167862, "num_tokens": 16452315.0, "step": 268 }, { "entropy": 1.27734375, "epoch": 0.9702434625788999, "grad_norm": 0.1156335324048996, "learning_rate": 0.00040359712230215826, "loss": 1.2302, "mean_token_accuracy": 0.6785985678434372, "num_tokens": 16514132.0, "step": 269 }, { "entropy": 1.2265625, "epoch": 0.9738503155996393, "grad_norm": 0.1110672652721405, "learning_rate": 0.00040323741007194247, "loss": 1.2015, "mean_token_accuracy": 0.6827525794506073, "num_tokens": 16575551.0, "step": 270 }, { "entropy": 1.25, "epoch": 0.9774571686203787, "grad_norm": 0.11445683985948563, "learning_rate": 0.0004028776978417266, "loss": 1.2198, "mean_token_accuracy": 0.6802763640880585, "num_tokens": 16636277.0, "step": 271 }, { "entropy": 1.228515625, "epoch": 0.9810640216411182, "grad_norm": 0.12164060026407242, "learning_rate": 0.00040251798561151083, "loss": 1.1906, "mean_token_accuracy": 0.6867797821760178, "num_tokens": 16696444.0, "step": 272 }, { "entropy": 1.23046875, "epoch": 0.9846708746618575, "grad_norm": 0.11492655426263809, "learning_rate": 0.000402158273381295, "loss": 1.2197, "mean_token_accuracy": 0.678614154458046, "num_tokens": 16757422.0, "step": 273 }, { "entropy": 1.2578125, "epoch": 0.9882777276825969, "grad_norm": 0.11276474595069885, "learning_rate": 0.00040179856115107914, "loss": 1.2233, "mean_token_accuracy": 0.6801506578922272, "num_tokens": 16818954.0, "step": 274 }, { "entropy": 1.240234375, "epoch": 0.9918845807033363, "grad_norm": 0.12744681537151337, "learning_rate": 0.0004014388489208633, "loss": 1.1931, "mean_token_accuracy": 0.6845046877861023, "num_tokens": 16880837.0, "step": 275 }, { "entropy": 1.2421875, "epoch": 0.9954914337240758, "grad_norm": 0.11264511197805405, "learning_rate": 0.0004010791366906475, "loss": 1.1869, "mean_token_accuracy": 0.6857382655143738, "num_tokens": 16942103.0, "step": 276 }, { "entropy": 1.263671875, "epoch": 0.9990982867448152, "grad_norm": 0.11393030732870102, "learning_rate": 0.00040071942446043166, "loss": 1.2302, "mean_token_accuracy": 0.6790373772382736, "num_tokens": 17003606.0, "step": 277 }, { "entropy": 1.296875, "epoch": 1.0, "grad_norm": 0.21479536592960358, "learning_rate": 0.0004003597122302158, "loss": 1.258, "mean_token_accuracy": 0.6718791723251343, "num_tokens": 17018574.0, "step": 278 }, { "epoch": 1.0, "eval_entropy": 1.24609375, "eval_loss": 1.222013235092163, "eval_mean_token_accuracy": 0.680073082447052, "eval_num_tokens": 17018574.0, "eval_runtime": 2.2803, "eval_samples_per_second": 21.927, "eval_steps_per_second": 0.877, "step": 278 }, { "entropy": 1.18359375, "epoch": 1.0036068530207394, "grad_norm": 0.1532185971736908, "learning_rate": 0.0004, "loss": 1.0444, "mean_token_accuracy": 0.7175655514001846, "num_tokens": 17080058.0, "step": 279 }, { "entropy": 1.109375, "epoch": 1.0072137060414788, "grad_norm": 0.13461436331272125, "learning_rate": 0.0003996402877697842, "loss": 1.0214, "mean_token_accuracy": 0.7220525145530701, "num_tokens": 17141370.0, "step": 280 }, { "entropy": 1.09765625, "epoch": 1.0108205590622181, "grad_norm": 0.13366983830928802, "learning_rate": 0.0003992805755395684, "loss": 1.1044, "mean_token_accuracy": 0.7030452191829681, "num_tokens": 17203442.0, "step": 281 }, { "entropy": 1.021484375, "epoch": 1.0144274120829577, "grad_norm": 0.14508597552776337, "learning_rate": 0.00039892086330935254, "loss": 1.054, "mean_token_accuracy": 0.7117158323526382, "num_tokens": 17265161.0, "step": 282 }, { "entropy": 1.05859375, "epoch": 1.018034265103697, "grad_norm": 0.13830584287643433, "learning_rate": 0.0003985611510791367, "loss": 1.056, "mean_token_accuracy": 0.7121240347623825, "num_tokens": 17326857.0, "step": 283 }, { "entropy": 1.115234375, "epoch": 1.0216411181244365, "grad_norm": 0.1405598670244217, "learning_rate": 0.00039820143884892084, "loss": 1.059, "mean_token_accuracy": 0.7117359787225723, "num_tokens": 17389450.0, "step": 284 }, { "entropy": 1.1640625, "epoch": 1.0252479711451759, "grad_norm": 0.13219821453094482, "learning_rate": 0.00039784172661870505, "loss": 1.0604, "mean_token_accuracy": 0.7123303711414337, "num_tokens": 17451889.0, "step": 285 }, { "entropy": 1.134765625, "epoch": 1.0288548241659152, "grad_norm": 0.13288705050945282, "learning_rate": 0.00039748201438848926, "loss": 1.0338, "mean_token_accuracy": 0.7192254811525345, "num_tokens": 17513408.0, "step": 286 }, { "entropy": 1.1484375, "epoch": 1.0324616771866546, "grad_norm": 0.12357257306575775, "learning_rate": 0.00039712230215827336, "loss": 1.0544, "mean_token_accuracy": 0.7149926871061325, "num_tokens": 17573755.0, "step": 287 }, { "entropy": 1.07421875, "epoch": 1.036068530207394, "grad_norm": 0.12823380529880524, "learning_rate": 0.00039676258992805757, "loss": 1.0214, "mean_token_accuracy": 0.7222512066364288, "num_tokens": 17633393.0, "step": 288 }, { "entropy": 1.048828125, "epoch": 1.0396753832281334, "grad_norm": 0.1331387162208557, "learning_rate": 0.0003964028776978417, "loss": 1.0591, "mean_token_accuracy": 0.715098649263382, "num_tokens": 17694040.0, "step": 289 }, { "entropy": 1.048828125, "epoch": 1.043282236248873, "grad_norm": 0.13274770975112915, "learning_rate": 0.00039604316546762593, "loss": 1.0554, "mean_token_accuracy": 0.7123450040817261, "num_tokens": 17757518.0, "step": 290 }, { "entropy": 1.048828125, "epoch": 1.0468890892696123, "grad_norm": 0.1252189725637436, "learning_rate": 0.0003956834532374101, "loss": 1.0201, "mean_token_accuracy": 0.7201875448226929, "num_tokens": 17818231.0, "step": 291 }, { "entropy": 1.15234375, "epoch": 1.0504959422903517, "grad_norm": 0.12560538947582245, "learning_rate": 0.00039532374100719424, "loss": 1.0928, "mean_token_accuracy": 0.7057194113731384, "num_tokens": 17879226.0, "step": 292 }, { "entropy": 1.109375, "epoch": 1.054102795311091, "grad_norm": 0.12489767372608185, "learning_rate": 0.0003949640287769784, "loss": 1.018, "mean_token_accuracy": 0.7216801941394806, "num_tokens": 17941440.0, "step": 293 }, { "entropy": 1.115234375, "epoch": 1.0577096483318305, "grad_norm": 0.12201204895973206, "learning_rate": 0.0003946043165467626, "loss": 1.0431, "mean_token_accuracy": 0.7156760543584824, "num_tokens": 18002946.0, "step": 294 }, { "entropy": 1.1015625, "epoch": 1.0613165013525698, "grad_norm": 0.12814654409885406, "learning_rate": 0.0003942446043165468, "loss": 1.0606, "mean_token_accuracy": 0.7130328416824341, "num_tokens": 18063297.0, "step": 295 }, { "entropy": 1.083984375, "epoch": 1.0649233543733092, "grad_norm": 0.13144978880882263, "learning_rate": 0.0003938848920863309, "loss": 1.0678, "mean_token_accuracy": 0.7106511294841766, "num_tokens": 18124296.0, "step": 296 }, { "entropy": 1.048828125, "epoch": 1.0685302073940486, "grad_norm": 0.13163822889328003, "learning_rate": 0.0003935251798561151, "loss": 1.0173, "mean_token_accuracy": 0.7213574945926666, "num_tokens": 18186382.0, "step": 297 }, { "entropy": 1.119140625, "epoch": 1.0721370604147882, "grad_norm": 0.12866437435150146, "learning_rate": 0.0003931654676258993, "loss": 1.0808, "mean_token_accuracy": 0.7069525271654129, "num_tokens": 18246547.0, "step": 298 }, { "entropy": 1.171875, "epoch": 1.0757439134355276, "grad_norm": 0.13252496719360352, "learning_rate": 0.0003928057553956835, "loss": 1.1153, "mean_token_accuracy": 0.7028367072343826, "num_tokens": 18307174.0, "step": 299 }, { "entropy": 1.07421875, "epoch": 1.079350766456267, "grad_norm": 0.12969689071178436, "learning_rate": 0.00039244604316546764, "loss": 0.9923, "mean_token_accuracy": 0.7260416895151138, "num_tokens": 18369378.0, "step": 300 }, { "entropy": 1.09765625, "epoch": 1.0829576194770063, "grad_norm": 0.13675308227539062, "learning_rate": 0.0003920863309352518, "loss": 1.058, "mean_token_accuracy": 0.7147640734910965, "num_tokens": 18429876.0, "step": 301 }, { "entropy": 1.107421875, "epoch": 1.0865644724977457, "grad_norm": 0.1312839537858963, "learning_rate": 0.00039172661870503594, "loss": 1.0628, "mean_token_accuracy": 0.7115120738744736, "num_tokens": 18492231.0, "step": 302 }, { "entropy": 1.056640625, "epoch": 1.090171325518485, "grad_norm": 0.13687828183174133, "learning_rate": 0.00039136690647482015, "loss": 1.0376, "mean_token_accuracy": 0.7186303436756134, "num_tokens": 18552863.0, "step": 303 }, { "entropy": 1.0625, "epoch": 1.0937781785392244, "grad_norm": 0.1349339634180069, "learning_rate": 0.00039100719424460436, "loss": 1.0388, "mean_token_accuracy": 0.7190583050251007, "num_tokens": 18613684.0, "step": 304 }, { "entropy": 1.078125, "epoch": 1.097385031559964, "grad_norm": 0.12771771848201752, "learning_rate": 0.0003906474820143885, "loss": 1.0439, "mean_token_accuracy": 0.7149292528629303, "num_tokens": 18674094.0, "step": 305 }, { "entropy": 1.109375, "epoch": 1.1009918845807034, "grad_norm": 0.1299605816602707, "learning_rate": 0.00039028776978417267, "loss": 1.0691, "mean_token_accuracy": 0.7104043364524841, "num_tokens": 18735093.0, "step": 306 }, { "entropy": 1.13671875, "epoch": 1.1045987376014428, "grad_norm": 0.12710529565811157, "learning_rate": 0.0003899280575539568, "loss": 1.0665, "mean_token_accuracy": 0.7118852436542511, "num_tokens": 18796809.0, "step": 307 }, { "entropy": 1.12109375, "epoch": 1.1082055906221822, "grad_norm": 0.1284399777650833, "learning_rate": 0.00038956834532374103, "loss": 1.0543, "mean_token_accuracy": 0.7137065082788467, "num_tokens": 18858082.0, "step": 308 }, { "entropy": 1.06640625, "epoch": 1.1118124436429215, "grad_norm": 0.13292910158634186, "learning_rate": 0.0003892086330935252, "loss": 1.0211, "mean_token_accuracy": 0.7181867063045502, "num_tokens": 18919405.0, "step": 309 }, { "entropy": 1.09375, "epoch": 1.115419296663661, "grad_norm": 0.13276222348213196, "learning_rate": 0.00038884892086330934, "loss": 1.0502, "mean_token_accuracy": 0.7141633629798889, "num_tokens": 18980941.0, "step": 310 }, { "entropy": 1.076171875, "epoch": 1.1190261496844003, "grad_norm": 0.1342204213142395, "learning_rate": 0.00038848920863309355, "loss": 1.0463, "mean_token_accuracy": 0.7141983807086945, "num_tokens": 19041196.0, "step": 311 }, { "entropy": 1.0634765625, "epoch": 1.1226330027051397, "grad_norm": 0.13001392781734467, "learning_rate": 0.0003881294964028777, "loss": 1.0311, "mean_token_accuracy": 0.7191209346055984, "num_tokens": 19102840.0, "step": 312 }, { "entropy": 1.119140625, "epoch": 1.1262398557258793, "grad_norm": 0.1417953073978424, "learning_rate": 0.0003877697841726619, "loss": 1.093, "mean_token_accuracy": 0.7082613408565521, "num_tokens": 19164357.0, "step": 313 }, { "entropy": 1.087890625, "epoch": 1.1298467087466186, "grad_norm": 0.12892118096351624, "learning_rate": 0.00038741007194244607, "loss": 1.0359, "mean_token_accuracy": 0.7175996154546738, "num_tokens": 19225823.0, "step": 314 }, { "entropy": 1.134765625, "epoch": 1.133453561767358, "grad_norm": 0.13197879493236542, "learning_rate": 0.0003870503597122302, "loss": 1.0758, "mean_token_accuracy": 0.7074074298143387, "num_tokens": 19286909.0, "step": 315 }, { "entropy": 1.12890625, "epoch": 1.1370604147880974, "grad_norm": 0.13496033847332, "learning_rate": 0.0003866906474820144, "loss": 1.0909, "mean_token_accuracy": 0.7053187936544418, "num_tokens": 19348268.0, "step": 316 }, { "entropy": 1.13671875, "epoch": 1.1406672678088368, "grad_norm": 0.14152295887470245, "learning_rate": 0.0003863309352517986, "loss": 1.0863, "mean_token_accuracy": 0.7082232385873795, "num_tokens": 19407099.0, "step": 317 }, { "entropy": 1.103515625, "epoch": 1.1442741208295761, "grad_norm": 0.13126792013645172, "learning_rate": 0.00038597122302158274, "loss": 1.0693, "mean_token_accuracy": 0.7098374366760254, "num_tokens": 19469416.0, "step": 318 }, { "entropy": 1.107421875, "epoch": 1.1478809738503155, "grad_norm": 0.13841383159160614, "learning_rate": 0.0003856115107913669, "loss": 1.0826, "mean_token_accuracy": 0.7098283916711807, "num_tokens": 19530900.0, "step": 319 }, { "entropy": 1.099609375, "epoch": 1.151487826871055, "grad_norm": 0.13077609241008759, "learning_rate": 0.0003852517985611511, "loss": 1.0366, "mean_token_accuracy": 0.7194613218307495, "num_tokens": 19592675.0, "step": 320 }, { "entropy": 1.095703125, "epoch": 1.1550946798917945, "grad_norm": 0.13558422029018402, "learning_rate": 0.00038489208633093525, "loss": 1.0539, "mean_token_accuracy": 0.7119215577840805, "num_tokens": 19653286.0, "step": 321 }, { "entropy": 1.1171875, "epoch": 1.1587015329125339, "grad_norm": 0.13994644582271576, "learning_rate": 0.00038453237410071946, "loss": 1.0801, "mean_token_accuracy": 0.7100097835063934, "num_tokens": 19716316.0, "step": 322 }, { "entropy": 1.072265625, "epoch": 1.1623083859332732, "grad_norm": 0.14376996457576752, "learning_rate": 0.0003841726618705036, "loss": 1.0261, "mean_token_accuracy": 0.7179982513189316, "num_tokens": 19777173.0, "step": 323 }, { "entropy": 1.1015625, "epoch": 1.1659152389540126, "grad_norm": 0.13048367202281952, "learning_rate": 0.00038381294964028777, "loss": 1.0697, "mean_token_accuracy": 0.7109661847352982, "num_tokens": 19837738.0, "step": 324 }, { "entropy": 1.11328125, "epoch": 1.169522091974752, "grad_norm": 0.13549086451530457, "learning_rate": 0.0003834532374100719, "loss": 1.0545, "mean_token_accuracy": 0.7117505967617035, "num_tokens": 19900268.0, "step": 325 }, { "entropy": 1.095703125, "epoch": 1.1731289449954914, "grad_norm": 0.13677242398262024, "learning_rate": 0.00038309352517985613, "loss": 1.0393, "mean_token_accuracy": 0.7148695737123489, "num_tokens": 19961777.0, "step": 326 }, { "entropy": 1.064453125, "epoch": 1.1767357980162307, "grad_norm": 0.1307229846715927, "learning_rate": 0.0003827338129496403, "loss": 1.0322, "mean_token_accuracy": 0.7173839658498764, "num_tokens": 20023529.0, "step": 327 }, { "entropy": 1.087890625, "epoch": 1.1803426510369703, "grad_norm": 0.12977871298789978, "learning_rate": 0.00038237410071942444, "loss": 1.0448, "mean_token_accuracy": 0.7170901298522949, "num_tokens": 20085864.0, "step": 328 }, { "entropy": 1.052734375, "epoch": 1.1839495040577097, "grad_norm": 0.1369037628173828, "learning_rate": 0.00038201438848920865, "loss": 1.0423, "mean_token_accuracy": 0.7154880315065384, "num_tokens": 20148205.0, "step": 329 }, { "entropy": 1.087890625, "epoch": 1.187556357078449, "grad_norm": 0.13064616918563843, "learning_rate": 0.0003816546762589928, "loss": 1.0572, "mean_token_accuracy": 0.7141537070274353, "num_tokens": 20210563.0, "step": 330 }, { "entropy": 1.11328125, "epoch": 1.1911632100991885, "grad_norm": 0.13562823832035065, "learning_rate": 0.000381294964028777, "loss": 1.0569, "mean_token_accuracy": 0.7124348729848862, "num_tokens": 20270605.0, "step": 331 }, { "entropy": 1.12890625, "epoch": 1.1947700631199278, "grad_norm": 0.1408068984746933, "learning_rate": 0.00038093525179856117, "loss": 1.0776, "mean_token_accuracy": 0.708257406949997, "num_tokens": 20332584.0, "step": 332 }, { "entropy": 1.107421875, "epoch": 1.1983769161406672, "grad_norm": 0.1361820548772812, "learning_rate": 0.0003805755395683453, "loss": 1.0382, "mean_token_accuracy": 0.7154301851987839, "num_tokens": 20393358.0, "step": 333 }, { "entropy": 1.095703125, "epoch": 1.2019837691614066, "grad_norm": 0.14277297258377075, "learning_rate": 0.0003802158273381295, "loss": 1.0629, "mean_token_accuracy": 0.7089251279830933, "num_tokens": 20453039.0, "step": 334 }, { "entropy": 1.099609375, "epoch": 1.2055906221821462, "grad_norm": 0.15519964694976807, "learning_rate": 0.0003798561151079137, "loss": 1.0799, "mean_token_accuracy": 0.7059762328863144, "num_tokens": 20514602.0, "step": 335 }, { "entropy": 1.0703125, "epoch": 1.2091974752028856, "grad_norm": 0.13580408692359924, "learning_rate": 0.0003794964028776979, "loss": 1.021, "mean_token_accuracy": 0.7171784937381744, "num_tokens": 20576129.0, "step": 336 }, { "entropy": 1.080078125, "epoch": 1.212804328223625, "grad_norm": 0.1347741037607193, "learning_rate": 0.00037913669064748205, "loss": 1.0334, "mean_token_accuracy": 0.7187928557395935, "num_tokens": 20637670.0, "step": 337 }, { "entropy": 1.09765625, "epoch": 1.2164111812443643, "grad_norm": 0.13330188393592834, "learning_rate": 0.0003787769784172662, "loss": 1.0599, "mean_token_accuracy": 0.7132797986268997, "num_tokens": 20699330.0, "step": 338 }, { "entropy": 1.0576171875, "epoch": 1.2200180342651037, "grad_norm": 0.13282665610313416, "learning_rate": 0.00037841726618705035, "loss": 1.0261, "mean_token_accuracy": 0.7214452624320984, "num_tokens": 20760929.0, "step": 339 }, { "entropy": 1.09375, "epoch": 1.223624887285843, "grad_norm": 0.13530196249485016, "learning_rate": 0.00037805755395683456, "loss": 1.0553, "mean_token_accuracy": 0.7116454839706421, "num_tokens": 20821032.0, "step": 340 }, { "entropy": 1.083984375, "epoch": 1.2272317403065824, "grad_norm": 0.14245475828647614, "learning_rate": 0.0003776978417266187, "loss": 1.0485, "mean_token_accuracy": 0.7152341455221176, "num_tokens": 20883332.0, "step": 341 }, { "entropy": 1.083984375, "epoch": 1.2308385933273218, "grad_norm": 0.14170053601264954, "learning_rate": 0.00037733812949640287, "loss": 1.0459, "mean_token_accuracy": 0.7162416875362396, "num_tokens": 20944016.0, "step": 342 }, { "entropy": 1.09375, "epoch": 1.2344454463480612, "grad_norm": 0.134388267993927, "learning_rate": 0.000376978417266187, "loss": 1.034, "mean_token_accuracy": 0.7166213691234589, "num_tokens": 21004680.0, "step": 343 }, { "entropy": 1.09765625, "epoch": 1.2380522993688008, "grad_norm": 0.1364385038614273, "learning_rate": 0.00037661870503597123, "loss": 1.0618, "mean_token_accuracy": 0.7135148048400879, "num_tokens": 21066450.0, "step": 344 }, { "entropy": 1.115234375, "epoch": 1.2416591523895402, "grad_norm": 0.14279454946517944, "learning_rate": 0.00037625899280575544, "loss": 1.0432, "mean_token_accuracy": 0.7155686914920807, "num_tokens": 21127408.0, "step": 345 }, { "entropy": 1.103515625, "epoch": 1.2452660054102795, "grad_norm": 0.13373027741909027, "learning_rate": 0.0003758992805755396, "loss": 1.0587, "mean_token_accuracy": 0.7126907557249069, "num_tokens": 21187776.0, "step": 346 }, { "entropy": 1.060546875, "epoch": 1.248872858431019, "grad_norm": 0.14535702764987946, "learning_rate": 0.00037553956834532375, "loss": 1.0113, "mean_token_accuracy": 0.7219382375478745, "num_tokens": 21249671.0, "step": 347 }, { "entropy": 1.103515625, "epoch": 1.2524797114517583, "grad_norm": 0.13926368951797485, "learning_rate": 0.0003751798561151079, "loss": 1.0848, "mean_token_accuracy": 0.7057438641786575, "num_tokens": 21313028.0, "step": 348 }, { "entropy": 1.076171875, "epoch": 1.2560865644724977, "grad_norm": 0.1494593769311905, "learning_rate": 0.0003748201438848921, "loss": 1.0241, "mean_token_accuracy": 0.7201995104551315, "num_tokens": 21374907.0, "step": 349 }, { "entropy": 1.107421875, "epoch": 1.2596934174932373, "grad_norm": 0.14162597060203552, "learning_rate": 0.00037446043165467627, "loss": 1.0742, "mean_token_accuracy": 0.7102047652006149, "num_tokens": 21434319.0, "step": 350 }, { "entropy": 1.05078125, "epoch": 1.2633002705139766, "grad_norm": 0.13195401430130005, "learning_rate": 0.0003741007194244604, "loss": 1.0087, "mean_token_accuracy": 0.7232490479946136, "num_tokens": 21496949.0, "step": 351 }, { "entropy": 1.08984375, "epoch": 1.266907123534716, "grad_norm": 0.13702073693275452, "learning_rate": 0.0003737410071942446, "loss": 1.0296, "mean_token_accuracy": 0.7174291461706161, "num_tokens": 21559158.0, "step": 352 }, { "entropy": 1.09765625, "epoch": 1.2705139765554554, "grad_norm": 0.1377280056476593, "learning_rate": 0.0003733812949640288, "loss": 1.0485, "mean_token_accuracy": 0.7144969552755356, "num_tokens": 21620529.0, "step": 353 }, { "entropy": 1.05859375, "epoch": 1.2741208295761948, "grad_norm": 0.13685151934623718, "learning_rate": 0.000373021582733813, "loss": 1.0194, "mean_token_accuracy": 0.7217678278684616, "num_tokens": 21680767.0, "step": 354 }, { "entropy": 1.052734375, "epoch": 1.2777276825969341, "grad_norm": 0.15255708992481232, "learning_rate": 0.00037266187050359715, "loss": 1.0435, "mean_token_accuracy": 0.7146172672510147, "num_tokens": 21741788.0, "step": 355 }, { "entropy": 1.06640625, "epoch": 1.2813345356176735, "grad_norm": 0.14156097173690796, "learning_rate": 0.0003723021582733813, "loss": 1.0114, "mean_token_accuracy": 0.724108412861824, "num_tokens": 21802112.0, "step": 356 }, { "entropy": 1.11328125, "epoch": 1.284941388638413, "grad_norm": 0.1385280042886734, "learning_rate": 0.00037194244604316545, "loss": 1.0529, "mean_token_accuracy": 0.7141352593898773, "num_tokens": 21864383.0, "step": 357 }, { "entropy": 1.1015625, "epoch": 1.2885482416591523, "grad_norm": 0.13795801997184753, "learning_rate": 0.00037158273381294966, "loss": 1.0451, "mean_token_accuracy": 0.7142595201730728, "num_tokens": 21926531.0, "step": 358 }, { "entropy": 1.076171875, "epoch": 1.2921550946798919, "grad_norm": 0.14140096306800842, "learning_rate": 0.0003712230215827338, "loss": 1.0295, "mean_token_accuracy": 0.7174100130796432, "num_tokens": 21988893.0, "step": 359 }, { "entropy": 1.103515625, "epoch": 1.2957619477006312, "grad_norm": 0.13924801349639893, "learning_rate": 0.00037086330935251797, "loss": 1.0608, "mean_token_accuracy": 0.7133963704109192, "num_tokens": 22050063.0, "step": 360 }, { "entropy": 1.080078125, "epoch": 1.2993688007213706, "grad_norm": 0.14152970910072327, "learning_rate": 0.0003705035971223021, "loss": 1.0508, "mean_token_accuracy": 0.7126090675592422, "num_tokens": 22112232.0, "step": 361 }, { "entropy": 1.064453125, "epoch": 1.30297565374211, "grad_norm": 0.14483888447284698, "learning_rate": 0.00037014388489208633, "loss": 1.0514, "mean_token_accuracy": 0.713681161403656, "num_tokens": 22173577.0, "step": 362 }, { "entropy": 1.095703125, "epoch": 1.3065825067628494, "grad_norm": 0.1392778605222702, "learning_rate": 0.00036978417266187054, "loss": 1.0405, "mean_token_accuracy": 0.7189803421497345, "num_tokens": 22237214.0, "step": 363 }, { "entropy": 1.10546875, "epoch": 1.3101893597835887, "grad_norm": 0.13870221376419067, "learning_rate": 0.0003694244604316547, "loss": 1.0357, "mean_token_accuracy": 0.7185314446687698, "num_tokens": 22298656.0, "step": 364 }, { "entropy": 1.09375, "epoch": 1.3137962128043283, "grad_norm": 0.13803422451019287, "learning_rate": 0.00036906474820143885, "loss": 1.0594, "mean_token_accuracy": 0.7141570746898651, "num_tokens": 22361252.0, "step": 365 }, { "entropy": 1.078125, "epoch": 1.3174030658250677, "grad_norm": 0.13701270520687103, "learning_rate": 0.000368705035971223, "loss": 1.0265, "mean_token_accuracy": 0.7180781811475754, "num_tokens": 22422959.0, "step": 366 }, { "entropy": 1.09765625, "epoch": 1.321009918845807, "grad_norm": 0.13325785100460052, "learning_rate": 0.0003683453237410072, "loss": 1.0497, "mean_token_accuracy": 0.7151871174573898, "num_tokens": 22485833.0, "step": 367 }, { "entropy": 1.076171875, "epoch": 1.3246167718665465, "grad_norm": 0.14257453382015228, "learning_rate": 0.00036798561151079137, "loss": 1.0624, "mean_token_accuracy": 0.710911214351654, "num_tokens": 22547829.0, "step": 368 }, { "entropy": 1.08984375, "epoch": 1.3282236248872858, "grad_norm": 0.14454475045204163, "learning_rate": 0.0003676258992805756, "loss": 1.0572, "mean_token_accuracy": 0.7106860876083374, "num_tokens": 22607873.0, "step": 369 }, { "entropy": 1.103515625, "epoch": 1.3318304779080252, "grad_norm": 0.14354878664016724, "learning_rate": 0.00036726618705035973, "loss": 1.0284, "mean_token_accuracy": 0.7208631485700607, "num_tokens": 22666647.0, "step": 370 }, { "entropy": 1.087890625, "epoch": 1.3354373309287646, "grad_norm": 0.13689127564430237, "learning_rate": 0.0003669064748201439, "loss": 1.0156, "mean_token_accuracy": 0.7236041277647018, "num_tokens": 22729353.0, "step": 371 }, { "entropy": 1.041015625, "epoch": 1.339044183949504, "grad_norm": 0.13225047290325165, "learning_rate": 0.0003665467625899281, "loss": 0.9818, "mean_token_accuracy": 0.728752002120018, "num_tokens": 22792213.0, "step": 372 }, { "entropy": 1.02734375, "epoch": 1.3426510369702434, "grad_norm": 0.14980781078338623, "learning_rate": 0.00036618705035971225, "loss": 1.0356, "mean_token_accuracy": 0.71525439620018, "num_tokens": 22854295.0, "step": 373 }, { "entropy": 1.033203125, "epoch": 1.346257889990983, "grad_norm": 0.14062675833702087, "learning_rate": 0.0003658273381294964, "loss": 1.0101, "mean_token_accuracy": 0.7271976321935654, "num_tokens": 22916215.0, "step": 374 }, { "entropy": 1.068359375, "epoch": 1.3498647430117223, "grad_norm": 0.14666685461997986, "learning_rate": 0.00036546762589928055, "loss": 1.0487, "mean_token_accuracy": 0.7150315195322037, "num_tokens": 22978105.0, "step": 375 }, { "entropy": 1.080078125, "epoch": 1.3534715960324617, "grad_norm": 0.14054550230503082, "learning_rate": 0.00036510791366906476, "loss": 1.0057, "mean_token_accuracy": 0.7271197736263275, "num_tokens": 23039579.0, "step": 376 }, { "entropy": 1.10546875, "epoch": 1.357078449053201, "grad_norm": 0.1432265341281891, "learning_rate": 0.0003647482014388489, "loss": 1.0245, "mean_token_accuracy": 0.7176426947116852, "num_tokens": 23101477.0, "step": 377 }, { "entropy": 1.068359375, "epoch": 1.3606853020739405, "grad_norm": 0.14140461385250092, "learning_rate": 0.0003643884892086331, "loss": 1.0236, "mean_token_accuracy": 0.7217485308647156, "num_tokens": 23163390.0, "step": 378 }, { "entropy": 1.087890625, "epoch": 1.3642921550946798, "grad_norm": 0.15188847482204437, "learning_rate": 0.0003640287769784173, "loss": 1.076, "mean_token_accuracy": 0.7099576890468597, "num_tokens": 23224191.0, "step": 379 }, { "entropy": 1.06640625, "epoch": 1.3678990081154194, "grad_norm": 0.14789770543575287, "learning_rate": 0.00036366906474820143, "loss": 1.0499, "mean_token_accuracy": 0.7172891199588776, "num_tokens": 23284818.0, "step": 380 }, { "entropy": 1.07421875, "epoch": 1.3715058611361588, "grad_norm": 0.14499987661838531, "learning_rate": 0.00036330935251798564, "loss": 1.0362, "mean_token_accuracy": 0.7168740779161453, "num_tokens": 23344701.0, "step": 381 }, { "entropy": 1.083984375, "epoch": 1.3751127141568982, "grad_norm": 0.13747680187225342, "learning_rate": 0.0003629496402877698, "loss": 1.0236, "mean_token_accuracy": 0.7177060544490814, "num_tokens": 23407046.0, "step": 382 }, { "entropy": 1.064453125, "epoch": 1.3787195671776376, "grad_norm": 0.13344493508338928, "learning_rate": 0.00036258992805755395, "loss": 1.0016, "mean_token_accuracy": 0.7250716537237167, "num_tokens": 23469763.0, "step": 383 }, { "entropy": 1.076171875, "epoch": 1.382326420198377, "grad_norm": 0.14250785112380981, "learning_rate": 0.0003622302158273381, "loss": 1.0294, "mean_token_accuracy": 0.7230207622051239, "num_tokens": 23530313.0, "step": 384 }, { "entropy": 1.0625, "epoch": 1.3859332732191163, "grad_norm": 0.14248697459697723, "learning_rate": 0.0003618705035971223, "loss": 1.0471, "mean_token_accuracy": 0.7146794497966766, "num_tokens": 23590131.0, "step": 385 }, { "entropy": 1.029296875, "epoch": 1.3895401262398557, "grad_norm": 0.14045003056526184, "learning_rate": 0.00036151079136690647, "loss": 0.9955, "mean_token_accuracy": 0.7246028184890747, "num_tokens": 23651400.0, "step": 386 }, { "entropy": 1.08203125, "epoch": 1.393146979260595, "grad_norm": 0.139520525932312, "learning_rate": 0.0003611510791366907, "loss": 1.0368, "mean_token_accuracy": 0.7194759249687195, "num_tokens": 23715071.0, "step": 387 }, { "entropy": 1.080078125, "epoch": 1.3967538322813344, "grad_norm": 0.13825471699237823, "learning_rate": 0.00036079136690647483, "loss": 1.0395, "mean_token_accuracy": 0.7189320176839828, "num_tokens": 23776894.0, "step": 388 }, { "entropy": 1.109375, "epoch": 1.4003606853020738, "grad_norm": 0.14453333616256714, "learning_rate": 0.000360431654676259, "loss": 1.0439, "mean_token_accuracy": 0.7163714915513992, "num_tokens": 23837747.0, "step": 389 }, { "entropy": 1.072265625, "epoch": 1.4039675383228134, "grad_norm": 0.14099536836147308, "learning_rate": 0.0003600719424460432, "loss": 1.0154, "mean_token_accuracy": 0.7236050516366959, "num_tokens": 23900024.0, "step": 390 }, { "entropy": 1.091796875, "epoch": 1.4075743913435528, "grad_norm": 0.14300987124443054, "learning_rate": 0.00035971223021582735, "loss": 1.0701, "mean_token_accuracy": 0.7131118923425674, "num_tokens": 23962263.0, "step": 391 }, { "entropy": 1.076171875, "epoch": 1.4111812443642922, "grad_norm": 0.14010730385780334, "learning_rate": 0.0003593525179856115, "loss": 1.0269, "mean_token_accuracy": 0.7197435051202774, "num_tokens": 24024453.0, "step": 392 }, { "entropy": 1.10546875, "epoch": 1.4147880973850315, "grad_norm": 0.13952654600143433, "learning_rate": 0.00035899280575539565, "loss": 1.0686, "mean_token_accuracy": 0.7110422700643539, "num_tokens": 24087432.0, "step": 393 }, { "entropy": 1.123046875, "epoch": 1.418394950405771, "grad_norm": 0.1550634205341339, "learning_rate": 0.00035863309352517986, "loss": 1.053, "mean_token_accuracy": 0.7152791023254395, "num_tokens": 24148784.0, "step": 394 }, { "entropy": 1.029296875, "epoch": 1.4220018034265105, "grad_norm": 0.14504283666610718, "learning_rate": 0.00035827338129496407, "loss": 1.0025, "mean_token_accuracy": 0.7257438600063324, "num_tokens": 24209892.0, "step": 395 }, { "entropy": 1.044921875, "epoch": 1.4256086564472499, "grad_norm": 0.1429341733455658, "learning_rate": 0.0003579136690647482, "loss": 1.0312, "mean_token_accuracy": 0.7204969972372055, "num_tokens": 24271744.0, "step": 396 }, { "entropy": 1.052734375, "epoch": 1.4292155094679893, "grad_norm": 0.1442926824092865, "learning_rate": 0.0003575539568345324, "loss": 1.0111, "mean_token_accuracy": 0.7220601737499237, "num_tokens": 24331713.0, "step": 397 }, { "entropy": 1.072265625, "epoch": 1.4328223624887286, "grad_norm": 0.14605337381362915, "learning_rate": 0.00035719424460431653, "loss": 1.0232, "mean_token_accuracy": 0.718538910150528, "num_tokens": 24394046.0, "step": 398 }, { "entropy": 1.09375, "epoch": 1.436429215509468, "grad_norm": 0.13914726674556732, "learning_rate": 0.00035683453237410074, "loss": 1.0331, "mean_token_accuracy": 0.7184961587190628, "num_tokens": 24456988.0, "step": 399 }, { "entropy": 1.06640625, "epoch": 1.4400360685302074, "grad_norm": 0.14562831819057465, "learning_rate": 0.0003564748201438849, "loss": 1.0117, "mean_token_accuracy": 0.7247395217418671, "num_tokens": 24517726.0, "step": 400 }, { "entropy": 1.03515625, "epoch": 1.4436429215509468, "grad_norm": 0.1414245367050171, "learning_rate": 0.0003561151079136691, "loss": 1.014, "mean_token_accuracy": 0.720547154545784, "num_tokens": 24578343.0, "step": 401 }, { "entropy": 1.0703125, "epoch": 1.4472497745716861, "grad_norm": 0.14571057260036469, "learning_rate": 0.0003557553956834532, "loss": 1.0272, "mean_token_accuracy": 0.7198485881090164, "num_tokens": 24639554.0, "step": 402 }, { "entropy": 1.072265625, "epoch": 1.4508566275924255, "grad_norm": 0.14170527458190918, "learning_rate": 0.0003553956834532374, "loss": 1.0084, "mean_token_accuracy": 0.7224139720201492, "num_tokens": 24701088.0, "step": 403 }, { "entropy": 1.03515625, "epoch": 1.4544634806131649, "grad_norm": 0.13828538358211517, "learning_rate": 0.0003550359712230216, "loss": 0.996, "mean_token_accuracy": 0.726051926612854, "num_tokens": 24762518.0, "step": 404 }, { "entropy": 1.0263671875, "epoch": 1.4580703336339045, "grad_norm": 0.14839991927146912, "learning_rate": 0.0003546762589928058, "loss": 1.013, "mean_token_accuracy": 0.7219575494527817, "num_tokens": 24825476.0, "step": 405 }, { "entropy": 1.076171875, "epoch": 1.4616771866546439, "grad_norm": 0.1514674872159958, "learning_rate": 0.00035431654676258993, "loss": 1.0521, "mean_token_accuracy": 0.7126452326774597, "num_tokens": 24885764.0, "step": 406 }, { "entropy": 1.083984375, "epoch": 1.4652840396753832, "grad_norm": 0.14279510080814362, "learning_rate": 0.0003539568345323741, "loss": 1.0244, "mean_token_accuracy": 0.7193940132856369, "num_tokens": 24946791.0, "step": 407 }, { "entropy": 1.1015625, "epoch": 1.4688908926961226, "grad_norm": 0.14598055183887482, "learning_rate": 0.0003535971223021583, "loss": 1.0468, "mean_token_accuracy": 0.7168498188257217, "num_tokens": 25005869.0, "step": 408 }, { "entropy": 1.080078125, "epoch": 1.472497745716862, "grad_norm": 0.1413266509771347, "learning_rate": 0.00035323741007194245, "loss": 1.0146, "mean_token_accuracy": 0.7220647484064102, "num_tokens": 25068578.0, "step": 409 }, { "entropy": 1.08203125, "epoch": 1.4761045987376016, "grad_norm": 0.14360283315181732, "learning_rate": 0.00035287769784172665, "loss": 1.0482, "mean_token_accuracy": 0.7131220400333405, "num_tokens": 25131010.0, "step": 410 }, { "entropy": 1.0625, "epoch": 1.479711451758341, "grad_norm": 0.14262841641902924, "learning_rate": 0.00035251798561151075, "loss": 1.0367, "mean_token_accuracy": 0.7210260778665543, "num_tokens": 25192586.0, "step": 411 }, { "entropy": 1.052734375, "epoch": 1.4833183047790803, "grad_norm": 0.14640897512435913, "learning_rate": 0.00035215827338129496, "loss": 1.0334, "mean_token_accuracy": 0.7185729593038559, "num_tokens": 25255529.0, "step": 412 }, { "entropy": 1.0859375, "epoch": 1.4869251577998197, "grad_norm": 0.1473883092403412, "learning_rate": 0.00035179856115107917, "loss": 1.0208, "mean_token_accuracy": 0.7234293520450592, "num_tokens": 25316976.0, "step": 413 }, { "entropy": 1.06640625, "epoch": 1.490532010820559, "grad_norm": 0.1415351778268814, "learning_rate": 0.0003514388489208633, "loss": 1.014, "mean_token_accuracy": 0.722156822681427, "num_tokens": 25377440.0, "step": 414 }, { "entropy": 1.103515625, "epoch": 1.4941388638412985, "grad_norm": 0.15093085169792175, "learning_rate": 0.0003510791366906475, "loss": 1.069, "mean_token_accuracy": 0.7116756439208984, "num_tokens": 25439375.0, "step": 415 }, { "entropy": 1.0146484375, "epoch": 1.4977457168620378, "grad_norm": 0.14616931974887848, "learning_rate": 0.00035071942446043163, "loss": 0.9823, "mean_token_accuracy": 0.7271050214767456, "num_tokens": 25501277.0, "step": 416 }, { "entropy": 1.01953125, "epoch": 1.5013525698827772, "grad_norm": 0.1495436429977417, "learning_rate": 0.00035035971223021584, "loss": 1.0164, "mean_token_accuracy": 0.7202876806259155, "num_tokens": 25561512.0, "step": 417 }, { "entropy": 1.0185546875, "epoch": 1.5049594229035166, "grad_norm": 0.13886940479278564, "learning_rate": 0.00035, "loss": 0.9777, "mean_token_accuracy": 0.7282452881336212, "num_tokens": 25625421.0, "step": 418 }, { "entropy": 1.087890625, "epoch": 1.508566275924256, "grad_norm": 0.14599931240081787, "learning_rate": 0.0003496402877697842, "loss": 1.0254, "mean_token_accuracy": 0.7221691906452179, "num_tokens": 25687341.0, "step": 419 }, { "entropy": 1.107421875, "epoch": 1.5121731289449953, "grad_norm": 0.1464518904685974, "learning_rate": 0.00034928057553956836, "loss": 1.0334, "mean_token_accuracy": 0.7214155942201614, "num_tokens": 25748443.0, "step": 420 }, { "entropy": 1.0546875, "epoch": 1.515779981965735, "grad_norm": 0.14493343234062195, "learning_rate": 0.0003489208633093525, "loss": 1.0179, "mean_token_accuracy": 0.7221162021160126, "num_tokens": 25810132.0, "step": 421 }, { "entropy": 1.037109375, "epoch": 1.5193868349864743, "grad_norm": 0.146087646484375, "learning_rate": 0.0003485611510791367, "loss": 1.028, "mean_token_accuracy": 0.7192852944135666, "num_tokens": 25871980.0, "step": 422 }, { "entropy": 1.0234375, "epoch": 1.5229936880072137, "grad_norm": 0.14769218862056732, "learning_rate": 0.0003482014388489209, "loss": 1.0034, "mean_token_accuracy": 0.7252810001373291, "num_tokens": 25932615.0, "step": 423 }, { "entropy": 1.091796875, "epoch": 1.526600541027953, "grad_norm": 0.15204836428165436, "learning_rate": 0.00034784172661870503, "loss": 1.0332, "mean_token_accuracy": 0.7176603674888611, "num_tokens": 25993805.0, "step": 424 }, { "entropy": 1.060546875, "epoch": 1.5302073940486927, "grad_norm": 0.15242035686969757, "learning_rate": 0.0003474820143884892, "loss": 0.998, "mean_token_accuracy": 0.7264039218425751, "num_tokens": 26056580.0, "step": 425 }, { "entropy": 1.095703125, "epoch": 1.533814247069432, "grad_norm": 0.1413901001214981, "learning_rate": 0.0003471223021582734, "loss": 1.0389, "mean_token_accuracy": 0.7172961384057999, "num_tokens": 26116432.0, "step": 426 }, { "entropy": 1.0859375, "epoch": 1.5374211000901714, "grad_norm": 0.14844612777233124, "learning_rate": 0.00034676258992805755, "loss": 1.0446, "mean_token_accuracy": 0.718516156077385, "num_tokens": 26178725.0, "step": 427 }, { "entropy": 1.06640625, "epoch": 1.5410279531109108, "grad_norm": 0.1492159217596054, "learning_rate": 0.00034640287769784175, "loss": 1.0372, "mean_token_accuracy": 0.7198771238327026, "num_tokens": 26239538.0, "step": 428 }, { "entropy": 1.05078125, "epoch": 1.5446348061316502, "grad_norm": 0.14545129239559174, "learning_rate": 0.0003460431654676259, "loss": 0.9953, "mean_token_accuracy": 0.7250233143568039, "num_tokens": 26300785.0, "step": 429 }, { "entropy": 1.060546875, "epoch": 1.5482416591523895, "grad_norm": 0.14596784114837646, "learning_rate": 0.00034568345323741006, "loss": 1.0297, "mean_token_accuracy": 0.7165802121162415, "num_tokens": 26361955.0, "step": 430 }, { "entropy": 1.0693359375, "epoch": 1.551848512173129, "grad_norm": 0.15327775478363037, "learning_rate": 0.00034532374100719427, "loss": 1.0371, "mean_token_accuracy": 0.7195271998643875, "num_tokens": 26421819.0, "step": 431 }, { "entropy": 1.08203125, "epoch": 1.5554553651938683, "grad_norm": 0.14710205793380737, "learning_rate": 0.0003449640287769784, "loss": 1.0628, "mean_token_accuracy": 0.7138154059648514, "num_tokens": 26483901.0, "step": 432 }, { "entropy": 1.11328125, "epoch": 1.5590622182146077, "grad_norm": 0.15460380911827087, "learning_rate": 0.00034460431654676263, "loss": 1.0801, "mean_token_accuracy": 0.7090575248003006, "num_tokens": 26543785.0, "step": 433 }, { "entropy": 1.111328125, "epoch": 1.562669071235347, "grad_norm": 0.14789527654647827, "learning_rate": 0.00034424460431654673, "loss": 1.062, "mean_token_accuracy": 0.7159483581781387, "num_tokens": 26604039.0, "step": 434 }, { "entropy": 1.04296875, "epoch": 1.5662759242560864, "grad_norm": 0.14169582724571228, "learning_rate": 0.00034388489208633094, "loss": 0.9796, "mean_token_accuracy": 0.7281365245580673, "num_tokens": 26664902.0, "step": 435 }, { "entropy": 1.0703125, "epoch": 1.569882777276826, "grad_norm": 0.14930906891822815, "learning_rate": 0.0003435251798561151, "loss": 0.9871, "mean_token_accuracy": 0.727164551615715, "num_tokens": 26725574.0, "step": 436 }, { "entropy": 1.048828125, "epoch": 1.5734896302975654, "grad_norm": 0.1507052779197693, "learning_rate": 0.0003431654676258993, "loss": 1.0079, "mean_token_accuracy": 0.7247980237007141, "num_tokens": 26785448.0, "step": 437 }, { "entropy": 1.0146484375, "epoch": 1.5770964833183048, "grad_norm": 0.17338210344314575, "learning_rate": 0.00034280575539568346, "loss": 1.0209, "mean_token_accuracy": 0.7208815813064575, "num_tokens": 26847058.0, "step": 438 }, { "entropy": 1.05859375, "epoch": 1.5807033363390441, "grad_norm": 0.14492666721343994, "learning_rate": 0.0003424460431654676, "loss": 1.0117, "mean_token_accuracy": 0.7240273356437683, "num_tokens": 26908856.0, "step": 439 }, { "entropy": 1.060546875, "epoch": 1.5843101893597837, "grad_norm": 0.14985105395317078, "learning_rate": 0.0003420863309352518, "loss": 1.0175, "mean_token_accuracy": 0.721354067325592, "num_tokens": 26971168.0, "step": 440 }, { "entropy": 1.05078125, "epoch": 1.5879170423805231, "grad_norm": 0.14478665590286255, "learning_rate": 0.000341726618705036, "loss": 1.0015, "mean_token_accuracy": 0.7264650911092758, "num_tokens": 27034466.0, "step": 441 }, { "entropy": 1.046875, "epoch": 1.5915238954012625, "grad_norm": 0.14160378277301788, "learning_rate": 0.0003413669064748202, "loss": 0.9794, "mean_token_accuracy": 0.729441300034523, "num_tokens": 27095813.0, "step": 442 }, { "entropy": 1.064453125, "epoch": 1.5951307484220019, "grad_norm": 0.14732828736305237, "learning_rate": 0.0003410071942446043, "loss": 1.0041, "mean_token_accuracy": 0.7257778644561768, "num_tokens": 27159576.0, "step": 443 }, { "entropy": 1.05859375, "epoch": 1.5987376014427412, "grad_norm": 0.1545983999967575, "learning_rate": 0.0003406474820143885, "loss": 1.0298, "mean_token_accuracy": 0.7196213752031326, "num_tokens": 27222284.0, "step": 444 }, { "entropy": 1.02734375, "epoch": 1.6023444544634806, "grad_norm": 0.15445514023303986, "learning_rate": 0.0003402877697841727, "loss": 1.0036, "mean_token_accuracy": 0.7231949716806412, "num_tokens": 27281686.0, "step": 445 }, { "entropy": 1.021484375, "epoch": 1.60595130748422, "grad_norm": 0.14997966587543488, "learning_rate": 0.00033992805755395686, "loss": 0.9946, "mean_token_accuracy": 0.7252556830644608, "num_tokens": 27343482.0, "step": 446 }, { "entropy": 1.0107421875, "epoch": 1.6095581605049594, "grad_norm": 0.15030330419540405, "learning_rate": 0.000339568345323741, "loss": 0.9856, "mean_token_accuracy": 0.7281950116157532, "num_tokens": 27404456.0, "step": 447 }, { "entropy": 1.052734375, "epoch": 1.6131650135256987, "grad_norm": 0.1488293707370758, "learning_rate": 0.00033920863309352516, "loss": 1.0099, "mean_token_accuracy": 0.7225365042686462, "num_tokens": 27466315.0, "step": 448 }, { "entropy": 1.068359375, "epoch": 1.6167718665464381, "grad_norm": 0.14713239669799805, "learning_rate": 0.00033884892086330937, "loss": 1.006, "mean_token_accuracy": 0.7233161330223083, "num_tokens": 27528442.0, "step": 449 }, { "entropy": 1.07421875, "epoch": 1.6203787195671775, "grad_norm": 0.14643454551696777, "learning_rate": 0.0003384892086330935, "loss": 1.0237, "mean_token_accuracy": 0.7174806743860245, "num_tokens": 27589210.0, "step": 450 }, { "entropy": 1.095703125, "epoch": 1.623985572587917, "grad_norm": 0.14547018706798553, "learning_rate": 0.00033812949640287773, "loss": 1.0439, "mean_token_accuracy": 0.7141027897596359, "num_tokens": 27651631.0, "step": 451 }, { "entropy": 1.0390625, "epoch": 1.6275924256086565, "grad_norm": 0.1475033164024353, "learning_rate": 0.00033776978417266183, "loss": 0.9876, "mean_token_accuracy": 0.7299511432647705, "num_tokens": 27711145.0, "step": 452 }, { "entropy": 1.0390625, "epoch": 1.6311992786293958, "grad_norm": 0.14480194449424744, "learning_rate": 0.00033741007194244604, "loss": 0.9973, "mean_token_accuracy": 0.7265053242444992, "num_tokens": 27773966.0, "step": 453 }, { "entropy": 1.0419921875, "epoch": 1.6348061316501352, "grad_norm": 0.15047596395015717, "learning_rate": 0.00033705035971223025, "loss": 1.0135, "mean_token_accuracy": 0.7235039472579956, "num_tokens": 27835864.0, "step": 454 }, { "entropy": 0.984375, "epoch": 1.6384129846708748, "grad_norm": 0.15022428333759308, "learning_rate": 0.0003366906474820144, "loss": 0.9607, "mean_token_accuracy": 0.7341495305299759, "num_tokens": 27898268.0, "step": 455 }, { "entropy": 1.037109375, "epoch": 1.6420198376916142, "grad_norm": 0.1499972939491272, "learning_rate": 0.00033633093525179856, "loss": 0.9979, "mean_token_accuracy": 0.7286583036184311, "num_tokens": 27959981.0, "step": 456 }, { "entropy": 1.0283203125, "epoch": 1.6456266907123536, "grad_norm": 0.1560281664133072, "learning_rate": 0.0003359712230215827, "loss": 1.0047, "mean_token_accuracy": 0.7245890349149704, "num_tokens": 28021813.0, "step": 457 }, { "entropy": 1.083984375, "epoch": 1.649233543733093, "grad_norm": 0.14171266555786133, "learning_rate": 0.0003356115107913669, "loss": 1.0454, "mean_token_accuracy": 0.7168945521116257, "num_tokens": 28084803.0, "step": 458 }, { "entropy": 1.05859375, "epoch": 1.6528403967538323, "grad_norm": 0.1469942182302475, "learning_rate": 0.0003352517985611511, "loss": 1.0011, "mean_token_accuracy": 0.723981574177742, "num_tokens": 28144498.0, "step": 459 }, { "entropy": 1.099609375, "epoch": 1.6564472497745717, "grad_norm": 0.1773584634065628, "learning_rate": 0.0003348920863309353, "loss": 1.0331, "mean_token_accuracy": 0.7156713455915451, "num_tokens": 28206904.0, "step": 460 }, { "entropy": 1.06640625, "epoch": 1.660054102795311, "grad_norm": 0.149765282869339, "learning_rate": 0.0003345323741007194, "loss": 0.9986, "mean_token_accuracy": 0.7263842672109604, "num_tokens": 28267168.0, "step": 461 }, { "entropy": 1.021484375, "epoch": 1.6636609558160504, "grad_norm": 0.14909668266773224, "learning_rate": 0.0003341726618705036, "loss": 0.9981, "mean_token_accuracy": 0.7262162417173386, "num_tokens": 28329607.0, "step": 462 }, { "entropy": 0.9912109375, "epoch": 1.6672678088367898, "grad_norm": 0.1545613408088684, "learning_rate": 0.0003338129496402878, "loss": 0.9832, "mean_token_accuracy": 0.729884922504425, "num_tokens": 28390169.0, "step": 463 }, { "entropy": 1.041015625, "epoch": 1.6708746618575292, "grad_norm": 0.15671759843826294, "learning_rate": 0.00033345323741007196, "loss": 1.0274, "mean_token_accuracy": 0.7213834822177887, "num_tokens": 28451866.0, "step": 464 }, { "entropy": 1.056640625, "epoch": 1.6744815148782686, "grad_norm": 0.15568043291568756, "learning_rate": 0.00033309352517985616, "loss": 1.0332, "mean_token_accuracy": 0.7182566225528717, "num_tokens": 28513790.0, "step": 465 }, { "entropy": 1.060546875, "epoch": 1.6780883678990082, "grad_norm": 0.14972512423992157, "learning_rate": 0.00033273381294964026, "loss": 0.9996, "mean_token_accuracy": 0.7245620936155319, "num_tokens": 28573351.0, "step": 466 }, { "entropy": 1.083984375, "epoch": 1.6816952209197475, "grad_norm": 0.1556672602891922, "learning_rate": 0.00033237410071942447, "loss": 1.0102, "mean_token_accuracy": 0.7251972705125809, "num_tokens": 28634465.0, "step": 467 }, { "entropy": 1.072265625, "epoch": 1.685302073940487, "grad_norm": 0.15847858786582947, "learning_rate": 0.0003320143884892086, "loss": 0.9889, "mean_token_accuracy": 0.7271071076393127, "num_tokens": 28697478.0, "step": 468 }, { "entropy": 1.072265625, "epoch": 1.6889089269612263, "grad_norm": 0.14419831335544586, "learning_rate": 0.00033165467625899283, "loss": 1.0325, "mean_token_accuracy": 0.7188725173473358, "num_tokens": 28759444.0, "step": 469 }, { "entropy": 1.0390625, "epoch": 1.692515779981966, "grad_norm": 0.15550972521305084, "learning_rate": 0.000331294964028777, "loss": 1.0237, "mean_token_accuracy": 0.7192263007164001, "num_tokens": 28819983.0, "step": 470 }, { "entropy": 1.03125, "epoch": 1.6961226330027053, "grad_norm": 0.15977877378463745, "learning_rate": 0.00033093525179856114, "loss": 1.0337, "mean_token_accuracy": 0.7194025963544846, "num_tokens": 28880199.0, "step": 471 }, { "entropy": 1.03515625, "epoch": 1.6997294860234446, "grad_norm": 0.15044797956943512, "learning_rate": 0.00033057553956834535, "loss": 1.0056, "mean_token_accuracy": 0.7232903242111206, "num_tokens": 28940202.0, "step": 472 }, { "entropy": 1.05859375, "epoch": 1.703336339044184, "grad_norm": 0.1577519178390503, "learning_rate": 0.0003302158273381295, "loss": 1.0, "mean_token_accuracy": 0.7250019162893295, "num_tokens": 29000507.0, "step": 473 }, { "entropy": 1.060546875, "epoch": 1.7069431920649234, "grad_norm": 0.15702258050441742, "learning_rate": 0.0003298561151079137, "loss": 0.9896, "mean_token_accuracy": 0.7290529906749725, "num_tokens": 29061497.0, "step": 474 }, { "entropy": 1.072265625, "epoch": 1.7105500450856628, "grad_norm": 0.15056470036506653, "learning_rate": 0.0003294964028776978, "loss": 1.0113, "mean_token_accuracy": 0.7224866896867752, "num_tokens": 29122880.0, "step": 475 }, { "entropy": 1.072265625, "epoch": 1.7141568981064021, "grad_norm": 0.15013793110847473, "learning_rate": 0.000329136690647482, "loss": 1.0371, "mean_token_accuracy": 0.7168307155370712, "num_tokens": 29184461.0, "step": 476 }, { "entropy": 1.048828125, "epoch": 1.7177637511271415, "grad_norm": 0.1463106870651245, "learning_rate": 0.0003287769784172662, "loss": 1.0201, "mean_token_accuracy": 0.7238983511924744, "num_tokens": 29245290.0, "step": 477 }, { "entropy": 1.072265625, "epoch": 1.721370604147881, "grad_norm": 0.154975026845932, "learning_rate": 0.0003284172661870504, "loss": 1.0273, "mean_token_accuracy": 0.7242570668458939, "num_tokens": 29305431.0, "step": 478 }, { "entropy": 1.046875, "epoch": 1.7249774571686203, "grad_norm": 0.15593965351581573, "learning_rate": 0.00032805755395683454, "loss": 1.001, "mean_token_accuracy": 0.7263014614582062, "num_tokens": 29368183.0, "step": 479 }, { "entropy": 1.07421875, "epoch": 1.7285843101893597, "grad_norm": 0.14988736808300018, "learning_rate": 0.0003276978417266187, "loss": 1.0289, "mean_token_accuracy": 0.7171696722507477, "num_tokens": 29430312.0, "step": 480 }, { "entropy": 1.06640625, "epoch": 1.732191163210099, "grad_norm": 0.15312054753303528, "learning_rate": 0.0003273381294964029, "loss": 1.0357, "mean_token_accuracy": 0.7174646705389023, "num_tokens": 29490306.0, "step": 481 }, { "entropy": 1.07421875, "epoch": 1.7357980162308386, "grad_norm": 0.15016838908195496, "learning_rate": 0.00032697841726618706, "loss": 1.0327, "mean_token_accuracy": 0.7145741730928421, "num_tokens": 29553663.0, "step": 482 }, { "entropy": 1.08203125, "epoch": 1.739404869251578, "grad_norm": 0.1526738703250885, "learning_rate": 0.00032661870503597126, "loss": 1.0038, "mean_token_accuracy": 0.7260808497667313, "num_tokens": 29615233.0, "step": 483 }, { "entropy": 1.0546875, "epoch": 1.7430117222723174, "grad_norm": 0.1557675451040268, "learning_rate": 0.00032625899280575536, "loss": 1.0143, "mean_token_accuracy": 0.7236200720071793, "num_tokens": 29677060.0, "step": 484 }, { "entropy": 1.009765625, "epoch": 1.746618575293057, "grad_norm": 0.15074867010116577, "learning_rate": 0.00032589928057553957, "loss": 0.9827, "mean_token_accuracy": 0.7308168709278107, "num_tokens": 29739515.0, "step": 485 }, { "entropy": 1.021484375, "epoch": 1.7502254283137963, "grad_norm": 0.15037979185581207, "learning_rate": 0.0003255395683453237, "loss": 0.9791, "mean_token_accuracy": 0.7306340932846069, "num_tokens": 29802468.0, "step": 486 }, { "entropy": 1.068359375, "epoch": 1.7538322813345357, "grad_norm": 0.15834461152553558, "learning_rate": 0.00032517985611510794, "loss": 1.0465, "mean_token_accuracy": 0.7158769220113754, "num_tokens": 29864076.0, "step": 487 }, { "entropy": 0.998046875, "epoch": 1.757439134355275, "grad_norm": 0.14991000294685364, "learning_rate": 0.00032482014388489214, "loss": 0.9611, "mean_token_accuracy": 0.7346139699220657, "num_tokens": 29927040.0, "step": 488 }, { "entropy": 1.07421875, "epoch": 1.7610459873760145, "grad_norm": 0.15665501356124878, "learning_rate": 0.00032446043165467624, "loss": 1.0146, "mean_token_accuracy": 0.7216772437095642, "num_tokens": 29986375.0, "step": 489 }, { "entropy": 1.083984375, "epoch": 1.7646528403967539, "grad_norm": 0.15105393528938293, "learning_rate": 0.00032410071942446045, "loss": 1.012, "mean_token_accuracy": 0.7204885482788086, "num_tokens": 30046667.0, "step": 490 }, { "entropy": 1.0234375, "epoch": 1.7682596934174932, "grad_norm": 0.1505628228187561, "learning_rate": 0.0003237410071942446, "loss": 0.9895, "mean_token_accuracy": 0.7257976531982422, "num_tokens": 30107481.0, "step": 491 }, { "entropy": 1.0263671875, "epoch": 1.7718665464382326, "grad_norm": 0.15280817449092865, "learning_rate": 0.0003233812949640288, "loss": 1.0079, "mean_token_accuracy": 0.7251942753791809, "num_tokens": 30168607.0, "step": 492 }, { "entropy": 1.005859375, "epoch": 1.775473399458972, "grad_norm": 0.15127186477184296, "learning_rate": 0.0003230215827338129, "loss": 0.9843, "mean_token_accuracy": 0.7280483096837997, "num_tokens": 30229227.0, "step": 493 }, { "entropy": 1.0703125, "epoch": 1.7790802524797114, "grad_norm": 0.15486064553260803, "learning_rate": 0.0003226618705035971, "loss": 1.0291, "mean_token_accuracy": 0.7196219563484192, "num_tokens": 30288770.0, "step": 494 }, { "entropy": 1.037109375, "epoch": 1.7826871055004507, "grad_norm": 0.14594468474388123, "learning_rate": 0.0003223021582733813, "loss": 0.9746, "mean_token_accuracy": 0.7308825999498367, "num_tokens": 30351335.0, "step": 495 }, { "entropy": 1.0751953125, "epoch": 1.78629395852119, "grad_norm": 0.151129812002182, "learning_rate": 0.0003219424460431655, "loss": 1.0325, "mean_token_accuracy": 0.7196879088878632, "num_tokens": 30413556.0, "step": 496 }, { "entropy": 1.0244140625, "epoch": 1.7899008115419297, "grad_norm": 0.15493570268154144, "learning_rate": 0.0003215827338129497, "loss": 0.9622, "mean_token_accuracy": 0.7339152842760086, "num_tokens": 30472866.0, "step": 497 }, { "entropy": 1.0625, "epoch": 1.793507664562669, "grad_norm": 0.15077629685401917, "learning_rate": 0.0003212230215827338, "loss": 1.0124, "mean_token_accuracy": 0.7240016460418701, "num_tokens": 30534225.0, "step": 498 }, { "entropy": 1.025390625, "epoch": 1.7971145175834085, "grad_norm": 0.15343870222568512, "learning_rate": 0.000320863309352518, "loss": 1.0048, "mean_token_accuracy": 0.724514052271843, "num_tokens": 30596657.0, "step": 499 }, { "entropy": 1.033203125, "epoch": 1.8007213706041478, "grad_norm": 0.14608000218868256, "learning_rate": 0.00032050359712230216, "loss": 0.9896, "mean_token_accuracy": 0.7272821217775345, "num_tokens": 30658361.0, "step": 500 }, { "entropy": 1.076171875, "epoch": 1.8043282236248874, "grad_norm": 0.15284915268421173, "learning_rate": 0.00032014388489208636, "loss": 1.033, "mean_token_accuracy": 0.7182740718126297, "num_tokens": 30718967.0, "step": 501 }, { "entropy": 1.068359375, "epoch": 1.8079350766456268, "grad_norm": 0.15503990650177002, "learning_rate": 0.00031978417266187046, "loss": 1.0112, "mean_token_accuracy": 0.7236196249723434, "num_tokens": 30778650.0, "step": 502 }, { "entropy": 1.0283203125, "epoch": 1.8115419296663662, "grad_norm": 0.14641845226287842, "learning_rate": 0.00031942446043165467, "loss": 0.9725, "mean_token_accuracy": 0.7331618964672089, "num_tokens": 30840082.0, "step": 503 }, { "entropy": 1.029296875, "epoch": 1.8151487826871056, "grad_norm": 0.1478937417268753, "learning_rate": 0.0003190647482014389, "loss": 0.9792, "mean_token_accuracy": 0.7282337099313736, "num_tokens": 30902259.0, "step": 504 }, { "entropy": 1.025390625, "epoch": 1.818755635707845, "grad_norm": 0.1649439036846161, "learning_rate": 0.00031870503597122304, "loss": 1.0025, "mean_token_accuracy": 0.725044384598732, "num_tokens": 30961651.0, "step": 505 }, { "entropy": 1.0234375, "epoch": 1.8223624887285843, "grad_norm": 0.15083837509155273, "learning_rate": 0.00031834532374100724, "loss": 0.982, "mean_token_accuracy": 0.7291984707117081, "num_tokens": 31022740.0, "step": 506 }, { "entropy": 1.0078125, "epoch": 1.8259693417493237, "grad_norm": 0.147655189037323, "learning_rate": 0.00031798561151079134, "loss": 0.9559, "mean_token_accuracy": 0.7338670045137405, "num_tokens": 31084744.0, "step": 507 }, { "entropy": 1.046875, "epoch": 1.829576194770063, "grad_norm": 0.14834074676036835, "learning_rate": 0.00031762589928057555, "loss": 0.9914, "mean_token_accuracy": 0.7292513102293015, "num_tokens": 31147466.0, "step": 508 }, { "entropy": 1.033203125, "epoch": 1.8331830477908024, "grad_norm": 0.1548537313938141, "learning_rate": 0.0003172661870503597, "loss": 0.9706, "mean_token_accuracy": 0.7325601130723953, "num_tokens": 31209070.0, "step": 509 }, { "entropy": 1.021484375, "epoch": 1.8367899008115418, "grad_norm": 0.1538155972957611, "learning_rate": 0.0003169064748201439, "loss": 0.9825, "mean_token_accuracy": 0.7312624007463455, "num_tokens": 31270586.0, "step": 510 }, { "entropy": 1.015625, "epoch": 1.8403967538322812, "grad_norm": 0.16332310438156128, "learning_rate": 0.000316546762589928, "loss": 0.9946, "mean_token_accuracy": 0.7287354916334152, "num_tokens": 31332180.0, "step": 511 }, { "entropy": 1.033203125, "epoch": 1.8440036068530208, "grad_norm": 0.15778550505638123, "learning_rate": 0.0003161870503597122, "loss": 1.0025, "mean_token_accuracy": 0.724731057882309, "num_tokens": 31393888.0, "step": 512 }, { "entropy": 1.03515625, "epoch": 1.8476104598737602, "grad_norm": 0.14917460083961487, "learning_rate": 0.00031582733812949643, "loss": 0.9897, "mean_token_accuracy": 0.7308174073696136, "num_tokens": 31456228.0, "step": 513 }, { "entropy": 1.048828125, "epoch": 1.8512173128944995, "grad_norm": 0.1536726951599121, "learning_rate": 0.0003154676258992806, "loss": 0.9875, "mean_token_accuracy": 0.7280675172805786, "num_tokens": 31519039.0, "step": 514 }, { "entropy": 1.060546875, "epoch": 1.854824165915239, "grad_norm": 0.16171295940876007, "learning_rate": 0.0003151079136690648, "loss": 0.9957, "mean_token_accuracy": 0.7269751727581024, "num_tokens": 31578912.0, "step": 515 }, { "entropy": 1.064453125, "epoch": 1.8584310189359785, "grad_norm": 0.1561453938484192, "learning_rate": 0.0003147482014388489, "loss": 1.0239, "mean_token_accuracy": 0.7213425785303116, "num_tokens": 31639219.0, "step": 516 }, { "entropy": 1.029296875, "epoch": 1.8620378719567179, "grad_norm": 0.1626780927181244, "learning_rate": 0.0003143884892086331, "loss": 0.995, "mean_token_accuracy": 0.7300314903259277, "num_tokens": 31700917.0, "step": 517 }, { "entropy": 1.048828125, "epoch": 1.8656447249774573, "grad_norm": 0.15688258409500122, "learning_rate": 0.00031402877697841726, "loss": 1.0031, "mean_token_accuracy": 0.7264000773429871, "num_tokens": 31763998.0, "step": 518 }, { "entropy": 0.9970703125, "epoch": 1.8692515779981966, "grad_norm": 0.15749165415763855, "learning_rate": 0.00031366906474820146, "loss": 0.9665, "mean_token_accuracy": 0.7338707745075226, "num_tokens": 31824155.0, "step": 519 }, { "entropy": 1.03515625, "epoch": 1.872858431018936, "grad_norm": 0.15830573439598083, "learning_rate": 0.00031330935251798556, "loss": 0.9977, "mean_token_accuracy": 0.7245229929685593, "num_tokens": 31884958.0, "step": 520 }, { "entropy": 1.044921875, "epoch": 1.8764652840396754, "grad_norm": 0.15258941054344177, "learning_rate": 0.0003129496402877698, "loss": 0.9876, "mean_token_accuracy": 0.731175497174263, "num_tokens": 31946385.0, "step": 521 }, { "entropy": 1.083984375, "epoch": 1.8800721370604148, "grad_norm": 0.16587425768375397, "learning_rate": 0.000312589928057554, "loss": 1.0207, "mean_token_accuracy": 0.7211079597473145, "num_tokens": 32006498.0, "step": 522 }, { "entropy": 1.05078125, "epoch": 1.8836789900811541, "grad_norm": 0.16131804883480072, "learning_rate": 0.00031223021582733814, "loss": 0.9842, "mean_token_accuracy": 0.7312700748443604, "num_tokens": 32066051.0, "step": 523 }, { "entropy": 1.046875, "epoch": 1.8872858431018935, "grad_norm": 0.16072450578212738, "learning_rate": 0.00031187050359712234, "loss": 0.9782, "mean_token_accuracy": 0.7319037318229675, "num_tokens": 32128249.0, "step": 524 }, { "entropy": 1.01171875, "epoch": 1.8908926961226329, "grad_norm": 0.17480826377868652, "learning_rate": 0.00031151079136690644, "loss": 1.0284, "mean_token_accuracy": 0.7197085320949554, "num_tokens": 32190044.0, "step": 525 }, { "entropy": 1.025390625, "epoch": 1.8944995491433723, "grad_norm": 0.1599782407283783, "learning_rate": 0.00031115107913669065, "loss": 0.9987, "mean_token_accuracy": 0.7273357808589935, "num_tokens": 32252452.0, "step": 526 }, { "entropy": 1.0205078125, "epoch": 1.8981064021641119, "grad_norm": 0.15095263719558716, "learning_rate": 0.0003107913669064748, "loss": 0.9834, "mean_token_accuracy": 0.7292700409889221, "num_tokens": 32314807.0, "step": 527 }, { "entropy": 1.0859375, "epoch": 1.9017132551848512, "grad_norm": 0.16473476588726044, "learning_rate": 0.000310431654676259, "loss": 1.0248, "mean_token_accuracy": 0.71968974173069, "num_tokens": 32373130.0, "step": 528 }, { "entropy": 1.0703125, "epoch": 1.9053201082055906, "grad_norm": 0.1563694328069687, "learning_rate": 0.0003100719424460432, "loss": 0.9952, "mean_token_accuracy": 0.7293795347213745, "num_tokens": 32435378.0, "step": 529 }, { "entropy": 1.056640625, "epoch": 1.90892696122633, "grad_norm": 0.1606423258781433, "learning_rate": 0.0003097122302158273, "loss": 0.9799, "mean_token_accuracy": 0.7328815460205078, "num_tokens": 32495743.0, "step": 530 }, { "entropy": 1.01953125, "epoch": 1.9125338142470696, "grad_norm": 0.15694768726825714, "learning_rate": 0.00030935251798561153, "loss": 0.9758, "mean_token_accuracy": 0.7317400127649307, "num_tokens": 32557134.0, "step": 531 }, { "entropy": 1.00390625, "epoch": 1.916140667267809, "grad_norm": 0.15672150254249573, "learning_rate": 0.0003089928057553957, "loss": 0.9821, "mean_token_accuracy": 0.7295788079500198, "num_tokens": 32619926.0, "step": 532 }, { "entropy": 0.9931640625, "epoch": 1.9197475202885483, "grad_norm": 0.15747086703777313, "learning_rate": 0.0003086330935251799, "loss": 0.9798, "mean_token_accuracy": 0.7320208996534348, "num_tokens": 32682278.0, "step": 533 }, { "entropy": 0.9892578125, "epoch": 1.9233543733092877, "grad_norm": 0.15500201284885406, "learning_rate": 0.000308273381294964, "loss": 0.9669, "mean_token_accuracy": 0.7293244004249573, "num_tokens": 32744082.0, "step": 534 }, { "entropy": 1.0068359375, "epoch": 1.926961226330027, "grad_norm": 0.15325218439102173, "learning_rate": 0.0003079136690647482, "loss": 0.9471, "mean_token_accuracy": 0.7374262660741806, "num_tokens": 32805274.0, "step": 535 }, { "entropy": 1.01953125, "epoch": 1.9305680793507665, "grad_norm": 0.15475596487522125, "learning_rate": 0.00030755395683453236, "loss": 0.9565, "mean_token_accuracy": 0.734167605638504, "num_tokens": 32867059.0, "step": 536 }, { "entropy": 1.0625, "epoch": 1.9341749323715058, "grad_norm": 0.15873922407627106, "learning_rate": 0.00030719424460431657, "loss": 1.0071, "mean_token_accuracy": 0.7254810929298401, "num_tokens": 32928967.0, "step": 537 }, { "entropy": 1.0390625, "epoch": 1.9377817853922452, "grad_norm": 0.16191133856773376, "learning_rate": 0.0003068345323741008, "loss": 1.0004, "mean_token_accuracy": 0.725796177983284, "num_tokens": 32987281.0, "step": 538 }, { "entropy": 1.025390625, "epoch": 1.9413886384129846, "grad_norm": 0.15287062525749207, "learning_rate": 0.0003064748201438849, "loss": 0.9699, "mean_token_accuracy": 0.7358278334140778, "num_tokens": 33048916.0, "step": 539 }, { "entropy": 1.033203125, "epoch": 1.944995491433724, "grad_norm": 0.15776167809963226, "learning_rate": 0.0003061151079136691, "loss": 0.995, "mean_token_accuracy": 0.7278575152158737, "num_tokens": 33107898.0, "step": 540 }, { "entropy": 1.005859375, "epoch": 1.9486023444544633, "grad_norm": 0.1525508612394333, "learning_rate": 0.00030575539568345324, "loss": 0.9751, "mean_token_accuracy": 0.7302428632974625, "num_tokens": 33169762.0, "step": 541 }, { "entropy": 1.009765625, "epoch": 1.952209197475203, "grad_norm": 0.16484840214252472, "learning_rate": 0.00030539568345323744, "loss": 0.9846, "mean_token_accuracy": 0.7289550602436066, "num_tokens": 33231941.0, "step": 542 }, { "entropy": 1.03125, "epoch": 1.9558160504959423, "grad_norm": 0.15907856822013855, "learning_rate": 0.00030503597122302154, "loss": 0.9985, "mean_token_accuracy": 0.7261295914649963, "num_tokens": 33294265.0, "step": 543 }, { "entropy": 1.041015625, "epoch": 1.9594229035166817, "grad_norm": 0.1595824807882309, "learning_rate": 0.00030467625899280575, "loss": 1.006, "mean_token_accuracy": 0.7278691828250885, "num_tokens": 33352951.0, "step": 544 }, { "entropy": 1.0283203125, "epoch": 1.963029756537421, "grad_norm": 0.16781815886497498, "learning_rate": 0.0003043165467625899, "loss": 0.9574, "mean_token_accuracy": 0.7339589148759842, "num_tokens": 33413589.0, "step": 545 }, { "entropy": 1.044921875, "epoch": 1.9666366095581607, "grad_norm": 0.1574292778968811, "learning_rate": 0.0003039568345323741, "loss": 0.9653, "mean_token_accuracy": 0.7325973063707352, "num_tokens": 33475383.0, "step": 546 }, { "entropy": 1.0048828125, "epoch": 1.9702434625789, "grad_norm": 0.1566307544708252, "learning_rate": 0.0003035971223021583, "loss": 0.95, "mean_token_accuracy": 0.7362354099750519, "num_tokens": 33537221.0, "step": 547 }, { "entropy": 0.998046875, "epoch": 1.9738503155996394, "grad_norm": 0.15702128410339355, "learning_rate": 0.0003032374100719424, "loss": 0.9563, "mean_token_accuracy": 0.7335593998432159, "num_tokens": 33597436.0, "step": 548 }, { "entropy": 1.0107421875, "epoch": 1.9774571686203788, "grad_norm": 0.1655985563993454, "learning_rate": 0.00030287769784172663, "loss": 0.9789, "mean_token_accuracy": 0.7305042445659637, "num_tokens": 33656438.0, "step": 549 }, { "entropy": 1.01953125, "epoch": 1.9810640216411182, "grad_norm": 0.16354624927043915, "learning_rate": 0.0003025179856115108, "loss": 0.9922, "mean_token_accuracy": 0.7277569770812988, "num_tokens": 33718970.0, "step": 550 }, { "entropy": 1.015625, "epoch": 1.9846708746618575, "grad_norm": 0.15630453824996948, "learning_rate": 0.000302158273381295, "loss": 0.9713, "mean_token_accuracy": 0.732896015048027, "num_tokens": 33781660.0, "step": 551 }, { "entropy": 1.021484375, "epoch": 1.988277727682597, "grad_norm": 0.15785540640354156, "learning_rate": 0.0003017985611510791, "loss": 0.9663, "mean_token_accuracy": 0.7303127646446228, "num_tokens": 33841073.0, "step": 552 }, { "entropy": 1.02734375, "epoch": 1.9918845807033363, "grad_norm": 0.1618838906288147, "learning_rate": 0.0003014388489208633, "loss": 0.9656, "mean_token_accuracy": 0.733149915933609, "num_tokens": 33900513.0, "step": 553 }, { "entropy": 1.0400390625, "epoch": 1.9954914337240757, "grad_norm": 0.16209132969379425, "learning_rate": 0.0003010791366906475, "loss": 0.9911, "mean_token_accuracy": 0.7273791432380676, "num_tokens": 33960318.0, "step": 554 }, { "entropy": 1.041015625, "epoch": 1.999098286744815, "grad_norm": 0.15763705968856812, "learning_rate": 0.00030071942446043167, "loss": 0.9843, "mean_token_accuracy": 0.7308461219072342, "num_tokens": 34022000.0, "step": 555 }, { "entropy": 1.0390625, "epoch": 2.0, "grad_norm": 0.2962213158607483, "learning_rate": 0.0003003597122302159, "loss": 0.949, "mean_token_accuracy": 0.7406987547874451, "num_tokens": 34037889.0, "step": 556 }, { "epoch": 2.0, "eval_entropy": 1.046875, "eval_loss": 1.1196398735046387, "eval_mean_token_accuracy": 0.7030805945396423, "eval_num_tokens": 34037889.0, "eval_runtime": 1.9903, "eval_samples_per_second": 25.121, "eval_steps_per_second": 1.005, "step": 556 }, { "entropy": 0.9287109375, "epoch": 2.0036068530207394, "grad_norm": 0.18578214943408966, "learning_rate": 0.0003, "loss": 0.7754, "mean_token_accuracy": 0.7810672372579575, "num_tokens": 34100033.0, "step": 557 }, { "entropy": 0.873046875, "epoch": 2.0072137060414788, "grad_norm": 0.182692751288414, "learning_rate": 0.0002996402877697842, "loss": 0.7606, "mean_token_accuracy": 0.7850523293018341, "num_tokens": 34160627.0, "step": 558 }, { "entropy": 0.8203125, "epoch": 2.010820559062218, "grad_norm": 0.16979150474071503, "learning_rate": 0.00029928057553956834, "loss": 0.7592, "mean_token_accuracy": 0.787190780043602, "num_tokens": 34220581.0, "step": 559 }, { "entropy": 0.775390625, "epoch": 2.0144274120829575, "grad_norm": 0.17777714133262634, "learning_rate": 0.00029892086330935254, "loss": 0.7405, "mean_token_accuracy": 0.789994403719902, "num_tokens": 34280383.0, "step": 560 }, { "entropy": 0.76953125, "epoch": 2.018034265103697, "grad_norm": 0.1805938482284546, "learning_rate": 0.0002985611510791367, "loss": 0.7317, "mean_token_accuracy": 0.7920661866664886, "num_tokens": 34342367.0, "step": 561 }, { "entropy": 0.7900390625, "epoch": 2.0216411181244363, "grad_norm": 0.18841806054115295, "learning_rate": 0.00029820143884892085, "loss": 0.759, "mean_token_accuracy": 0.786965399980545, "num_tokens": 34405523.0, "step": 562 }, { "entropy": 0.8046875, "epoch": 2.0252479711451756, "grad_norm": 0.1804211139678955, "learning_rate": 0.00029784172661870506, "loss": 0.7456, "mean_token_accuracy": 0.7878299057483673, "num_tokens": 34465906.0, "step": 563 }, { "entropy": 0.849609375, "epoch": 2.0288548241659154, "grad_norm": 0.17306514084339142, "learning_rate": 0.0002974820143884892, "loss": 0.7685, "mean_token_accuracy": 0.7825770825147629, "num_tokens": 34527157.0, "step": 564 }, { "entropy": 0.85546875, "epoch": 2.032461677186655, "grad_norm": 0.16538429260253906, "learning_rate": 0.0002971223021582734, "loss": 0.7534, "mean_token_accuracy": 0.7844157963991165, "num_tokens": 34589897.0, "step": 565 }, { "entropy": 0.87109375, "epoch": 2.036068530207394, "grad_norm": 0.16972213983535767, "learning_rate": 0.0002967625899280575, "loss": 0.7706, "mean_token_accuracy": 0.7842663824558258, "num_tokens": 34651140.0, "step": 566 }, { "entropy": 0.8349609375, "epoch": 2.0396753832281336, "grad_norm": 0.16319338977336884, "learning_rate": 0.00029640287769784173, "loss": 0.7617, "mean_token_accuracy": 0.7824758440256119, "num_tokens": 34712787.0, "step": 567 }, { "entropy": 0.810546875, "epoch": 2.043282236248873, "grad_norm": 0.17336887121200562, "learning_rate": 0.0002960431654676259, "loss": 0.7249, "mean_token_accuracy": 0.7911884039640427, "num_tokens": 34774692.0, "step": 568 }, { "entropy": 0.814453125, "epoch": 2.0468890892696123, "grad_norm": 0.17789015173912048, "learning_rate": 0.0002956834532374101, "loss": 0.7537, "mean_token_accuracy": 0.7856712937355042, "num_tokens": 34837440.0, "step": 569 }, { "entropy": 0.8271484375, "epoch": 2.0504959422903517, "grad_norm": 0.19014592468738556, "learning_rate": 0.00029532374100719425, "loss": 0.7702, "mean_token_accuracy": 0.7835455387830734, "num_tokens": 34897588.0, "step": 570 }, { "entropy": 0.7890625, "epoch": 2.054102795311091, "grad_norm": 0.16571538150310516, "learning_rate": 0.0002949640287769784, "loss": 0.7219, "mean_token_accuracy": 0.7937293648719788, "num_tokens": 34958120.0, "step": 571 }, { "entropy": 0.8173828125, "epoch": 2.0577096483318305, "grad_norm": 0.17018122971057892, "learning_rate": 0.0002946043165467626, "loss": 0.7563, "mean_token_accuracy": 0.7846188247203827, "num_tokens": 35018072.0, "step": 572 }, { "entropy": 0.8369140625, "epoch": 2.06131650135257, "grad_norm": 0.17384840548038483, "learning_rate": 0.00029424460431654677, "loss": 0.7844, "mean_token_accuracy": 0.7789180725812912, "num_tokens": 35079060.0, "step": 573 }, { "entropy": 0.8330078125, "epoch": 2.064923354373309, "grad_norm": 0.17138977348804474, "learning_rate": 0.000293884892086331, "loss": 0.7635, "mean_token_accuracy": 0.7843373864889145, "num_tokens": 35142006.0, "step": 574 }, { "entropy": 0.80859375, "epoch": 2.0685302073940486, "grad_norm": 0.16534361243247986, "learning_rate": 0.0002935251798561151, "loss": 0.7291, "mean_token_accuracy": 0.7933791279792786, "num_tokens": 35204684.0, "step": 575 }, { "entropy": 0.8330078125, "epoch": 2.072137060414788, "grad_norm": 0.1642112135887146, "learning_rate": 0.0002931654676258993, "loss": 0.7709, "mean_token_accuracy": 0.7824510484933853, "num_tokens": 35265995.0, "step": 576 }, { "entropy": 0.81640625, "epoch": 2.0757439134355273, "grad_norm": 0.1663014441728592, "learning_rate": 0.00029280575539568344, "loss": 0.738, "mean_token_accuracy": 0.7911092340946198, "num_tokens": 35326624.0, "step": 577 }, { "entropy": 0.8095703125, "epoch": 2.0793507664562667, "grad_norm": 0.1753828525543213, "learning_rate": 0.00029244604316546764, "loss": 0.7469, "mean_token_accuracy": 0.7901962846517563, "num_tokens": 35388345.0, "step": 578 }, { "entropy": 0.794921875, "epoch": 2.082957619477006, "grad_norm": 0.16781604290008545, "learning_rate": 0.00029208633093525185, "loss": 0.7255, "mean_token_accuracy": 0.7923619151115417, "num_tokens": 35451271.0, "step": 579 }, { "entropy": 0.7861328125, "epoch": 2.086564472497746, "grad_norm": 0.17224448919296265, "learning_rate": 0.00029172661870503595, "loss": 0.7216, "mean_token_accuracy": 0.7940124869346619, "num_tokens": 35513761.0, "step": 580 }, { "entropy": 0.818359375, "epoch": 2.0901713255184853, "grad_norm": 0.17551769316196442, "learning_rate": 0.00029136690647482016, "loss": 0.7649, "mean_token_accuracy": 0.782198429107666, "num_tokens": 35574467.0, "step": 581 }, { "entropy": 0.82421875, "epoch": 2.0937781785392247, "grad_norm": 0.17525380849838257, "learning_rate": 0.0002910071942446043, "loss": 0.7621, "mean_token_accuracy": 0.7855977416038513, "num_tokens": 35634911.0, "step": 582 }, { "entropy": 0.849609375, "epoch": 2.097385031559964, "grad_norm": 0.17407967150211334, "learning_rate": 0.0002906474820143885, "loss": 0.7719, "mean_token_accuracy": 0.7826326787471771, "num_tokens": 35696466.0, "step": 583 }, { "entropy": 0.8203125, "epoch": 2.1009918845807034, "grad_norm": 0.17458270490169525, "learning_rate": 0.0002902877697841726, "loss": 0.7469, "mean_token_accuracy": 0.7868435829877853, "num_tokens": 35757947.0, "step": 584 }, { "entropy": 0.833984375, "epoch": 2.104598737601443, "grad_norm": 0.17013199627399445, "learning_rate": 0.00028992805755395683, "loss": 0.7734, "mean_token_accuracy": 0.7815344482660294, "num_tokens": 35819814.0, "step": 585 }, { "entropy": 0.798828125, "epoch": 2.108205590622182, "grad_norm": 0.1657773107290268, "learning_rate": 0.000289568345323741, "loss": 0.7275, "mean_token_accuracy": 0.7936546951532364, "num_tokens": 35882588.0, "step": 586 }, { "entropy": 0.8369140625, "epoch": 2.1118124436429215, "grad_norm": 0.1734456717967987, "learning_rate": 0.0002892086330935252, "loss": 0.7682, "mean_token_accuracy": 0.7854891121387482, "num_tokens": 35945234.0, "step": 587 }, { "entropy": 0.80859375, "epoch": 2.115419296663661, "grad_norm": 0.17951053380966187, "learning_rate": 0.0002888489208633094, "loss": 0.7522, "mean_token_accuracy": 0.7848480194807053, "num_tokens": 36005340.0, "step": 588 }, { "entropy": 0.80859375, "epoch": 2.1190261496844003, "grad_norm": 0.17793695628643036, "learning_rate": 0.0002884892086330935, "loss": 0.7401, "mean_token_accuracy": 0.7884912341833115, "num_tokens": 36067439.0, "step": 589 }, { "entropy": 0.806640625, "epoch": 2.1226330027051397, "grad_norm": 0.17605239152908325, "learning_rate": 0.0002881294964028777, "loss": 0.7445, "mean_token_accuracy": 0.7905792742967606, "num_tokens": 36130770.0, "step": 590 }, { "entropy": 0.779296875, "epoch": 2.126239855725879, "grad_norm": 0.18497562408447266, "learning_rate": 0.00028776978417266187, "loss": 0.7301, "mean_token_accuracy": 0.7893381118774414, "num_tokens": 36192162.0, "step": 591 }, { "entropy": 0.7783203125, "epoch": 2.1298467087466184, "grad_norm": 0.17526225745677948, "learning_rate": 0.0002874100719424461, "loss": 0.7077, "mean_token_accuracy": 0.7960422486066818, "num_tokens": 36254775.0, "step": 592 }, { "entropy": 0.78125, "epoch": 2.133453561767358, "grad_norm": 0.1672462522983551, "learning_rate": 0.00028705035971223023, "loss": 0.6987, "mean_token_accuracy": 0.7994558811187744, "num_tokens": 36315830.0, "step": 593 }, { "entropy": 0.81640625, "epoch": 2.137060414788097, "grad_norm": 0.16928352415561676, "learning_rate": 0.0002866906474820144, "loss": 0.7506, "mean_token_accuracy": 0.787496954202652, "num_tokens": 36378350.0, "step": 594 }, { "entropy": 0.8359375, "epoch": 2.140667267808837, "grad_norm": 0.18231336772441864, "learning_rate": 0.00028633093525179854, "loss": 0.7727, "mean_token_accuracy": 0.7810271829366684, "num_tokens": 36438874.0, "step": 595 }, { "entropy": 0.7919921875, "epoch": 2.1442741208295764, "grad_norm": 0.17626817524433136, "learning_rate": 0.00028597122302158275, "loss": 0.7253, "mean_token_accuracy": 0.7941576391458511, "num_tokens": 36499905.0, "step": 596 }, { "entropy": 0.8046875, "epoch": 2.1478809738503157, "grad_norm": 0.16970813274383545, "learning_rate": 0.00028561151079136695, "loss": 0.7363, "mean_token_accuracy": 0.7911288440227509, "num_tokens": 36561110.0, "step": 597 }, { "entropy": 0.78515625, "epoch": 2.151487826871055, "grad_norm": 0.1732400357723236, "learning_rate": 0.00028525179856115105, "loss": 0.7296, "mean_token_accuracy": 0.7943115532398224, "num_tokens": 36622080.0, "step": 598 }, { "entropy": 0.79296875, "epoch": 2.1550946798917945, "grad_norm": 0.17434056103229523, "learning_rate": 0.00028489208633093526, "loss": 0.7464, "mean_token_accuracy": 0.7845259010791779, "num_tokens": 36682703.0, "step": 599 }, { "entropy": 0.80078125, "epoch": 2.158701532912534, "grad_norm": 0.17977698147296906, "learning_rate": 0.0002845323741007194, "loss": 0.7314, "mean_token_accuracy": 0.7918253839015961, "num_tokens": 36743715.0, "step": 600 }, { "entropy": 0.830078125, "epoch": 2.1623083859332732, "grad_norm": 0.1823503077030182, "learning_rate": 0.0002841726618705036, "loss": 0.7676, "mean_token_accuracy": 0.7845704108476639, "num_tokens": 36804356.0, "step": 601 }, { "entropy": 0.8037109375, "epoch": 2.1659152389540126, "grad_norm": 0.17771784961223602, "learning_rate": 0.0002838129496402878, "loss": 0.7327, "mean_token_accuracy": 0.7898932844400406, "num_tokens": 36864477.0, "step": 602 }, { "entropy": 0.8388671875, "epoch": 2.169522091974752, "grad_norm": 0.1720096915960312, "learning_rate": 0.00028345323741007193, "loss": 0.7481, "mean_token_accuracy": 0.7871522754430771, "num_tokens": 36925606.0, "step": 603 }, { "entropy": 0.828125, "epoch": 2.1731289449954914, "grad_norm": 0.1775212287902832, "learning_rate": 0.00028309352517985614, "loss": 0.7453, "mean_token_accuracy": 0.7882642298936844, "num_tokens": 36985195.0, "step": 604 }, { "entropy": 0.8427734375, "epoch": 2.1767357980162307, "grad_norm": 0.17514289915561676, "learning_rate": 0.0002827338129496403, "loss": 0.7778, "mean_token_accuracy": 0.7814425081014633, "num_tokens": 37045480.0, "step": 605 }, { "entropy": 0.814453125, "epoch": 2.18034265103697, "grad_norm": 0.17965127527713776, "learning_rate": 0.0002823741007194245, "loss": 0.7654, "mean_token_accuracy": 0.7816276848316193, "num_tokens": 37107219.0, "step": 606 }, { "entropy": 0.771484375, "epoch": 2.1839495040577095, "grad_norm": 0.17314137518405914, "learning_rate": 0.0002820143884892086, "loss": 0.7333, "mean_token_accuracy": 0.7895840853452682, "num_tokens": 37168661.0, "step": 607 }, { "entropy": 0.7744140625, "epoch": 2.187556357078449, "grad_norm": 0.17360344529151917, "learning_rate": 0.0002816546762589928, "loss": 0.7177, "mean_token_accuracy": 0.7921271920204163, "num_tokens": 37230796.0, "step": 608 }, { "entropy": 0.8076171875, "epoch": 2.1911632100991882, "grad_norm": 0.17653188109397888, "learning_rate": 0.00028129496402877697, "loss": 0.7321, "mean_token_accuracy": 0.7900026440620422, "num_tokens": 37291972.0, "step": 609 }, { "entropy": 0.8427734375, "epoch": 2.194770063119928, "grad_norm": 0.181155264377594, "learning_rate": 0.0002809352517985612, "loss": 0.7673, "mean_token_accuracy": 0.7858395129442215, "num_tokens": 37353729.0, "step": 610 }, { "entropy": 0.810546875, "epoch": 2.1983769161406674, "grad_norm": 0.178135946393013, "learning_rate": 0.00028057553956834533, "loss": 0.7226, "mean_token_accuracy": 0.7946770340204239, "num_tokens": 37415534.0, "step": 611 }, { "entropy": 0.8232421875, "epoch": 2.201983769161407, "grad_norm": 0.182571142911911, "learning_rate": 0.0002802158273381295, "loss": 0.7625, "mean_token_accuracy": 0.7832108587026596, "num_tokens": 37476671.0, "step": 612 }, { "entropy": 0.833984375, "epoch": 2.205590622182146, "grad_norm": 0.17927835881710052, "learning_rate": 0.0002798561151079137, "loss": 0.7605, "mean_token_accuracy": 0.7847219407558441, "num_tokens": 37538685.0, "step": 613 }, { "entropy": 0.830078125, "epoch": 2.2091974752028856, "grad_norm": 0.1767112761735916, "learning_rate": 0.00027949640287769785, "loss": 0.7699, "mean_token_accuracy": 0.7830337285995483, "num_tokens": 37600022.0, "step": 614 }, { "entropy": 0.7919921875, "epoch": 2.212804328223625, "grad_norm": 0.1755734384059906, "learning_rate": 0.00027913669064748205, "loss": 0.7385, "mean_token_accuracy": 0.7895809859037399, "num_tokens": 37662815.0, "step": 615 }, { "entropy": 0.7958984375, "epoch": 2.2164111812443643, "grad_norm": 0.17812463641166687, "learning_rate": 0.00027877697841726615, "loss": 0.7433, "mean_token_accuracy": 0.7880703806877136, "num_tokens": 37725290.0, "step": 616 }, { "entropy": 0.810546875, "epoch": 2.2200180342651037, "grad_norm": 0.18269549310207367, "learning_rate": 0.00027841726618705036, "loss": 0.7583, "mean_token_accuracy": 0.7836417406797409, "num_tokens": 37786556.0, "step": 617 }, { "entropy": 0.7763671875, "epoch": 2.223624887285843, "grad_norm": 0.17176507413387299, "learning_rate": 0.0002780575539568345, "loss": 0.7154, "mean_token_accuracy": 0.7968256771564484, "num_tokens": 37848776.0, "step": 618 }, { "entropy": 0.810546875, "epoch": 2.2272317403065824, "grad_norm": 0.18192490935325623, "learning_rate": 0.0002776978417266187, "loss": 0.751, "mean_token_accuracy": 0.7858532518148422, "num_tokens": 37910292.0, "step": 619 }, { "entropy": 0.8203125, "epoch": 2.230838593327322, "grad_norm": 0.17693746089935303, "learning_rate": 0.0002773381294964029, "loss": 0.7406, "mean_token_accuracy": 0.7926157712936401, "num_tokens": 37972769.0, "step": 620 }, { "entropy": 0.8310546875, "epoch": 2.234445446348061, "grad_norm": 0.19954904913902283, "learning_rate": 0.00027697841726618703, "loss": 0.768, "mean_token_accuracy": 0.7841173261404037, "num_tokens": 38034521.0, "step": 621 }, { "entropy": 0.794921875, "epoch": 2.2380522993688006, "grad_norm": 0.17349690198898315, "learning_rate": 0.00027661870503597124, "loss": 0.7314, "mean_token_accuracy": 0.7913317531347275, "num_tokens": 38097025.0, "step": 622 }, { "entropy": 0.787109375, "epoch": 2.24165915238954, "grad_norm": 0.17653638124465942, "learning_rate": 0.0002762589928057554, "loss": 0.7075, "mean_token_accuracy": 0.799065351486206, "num_tokens": 38158086.0, "step": 623 }, { "entropy": 0.787109375, "epoch": 2.2452660054102793, "grad_norm": 0.17829859256744385, "learning_rate": 0.0002758992805755396, "loss": 0.7304, "mean_token_accuracy": 0.7894476652145386, "num_tokens": 38220325.0, "step": 624 }, { "entropy": 0.8203125, "epoch": 2.248872858431019, "grad_norm": 0.18479490280151367, "learning_rate": 0.00027553956834532376, "loss": 0.7664, "mean_token_accuracy": 0.7843069732189178, "num_tokens": 38282225.0, "step": 625 }, { "entropy": 0.7724609375, "epoch": 2.2524797114517585, "grad_norm": 0.17846599221229553, "learning_rate": 0.0002751798561151079, "loss": 0.7061, "mean_token_accuracy": 0.7978968620300293, "num_tokens": 38346033.0, "step": 626 }, { "entropy": 0.8125, "epoch": 2.256086564472498, "grad_norm": 0.17428778111934662, "learning_rate": 0.00027482014388489207, "loss": 0.7527, "mean_token_accuracy": 0.7874512374401093, "num_tokens": 38407255.0, "step": 627 }, { "entropy": 0.8154296875, "epoch": 2.2596934174932373, "grad_norm": 0.17659594118595123, "learning_rate": 0.0002744604316546763, "loss": 0.7476, "mean_token_accuracy": 0.7883879691362381, "num_tokens": 38468378.0, "step": 628 }, { "entropy": 0.822265625, "epoch": 2.2633002705139766, "grad_norm": 0.18058140575885773, "learning_rate": 0.00027410071942446043, "loss": 0.7538, "mean_token_accuracy": 0.7871584445238113, "num_tokens": 38528855.0, "step": 629 }, { "entropy": 0.8388671875, "epoch": 2.266907123534716, "grad_norm": 0.18415984511375427, "learning_rate": 0.0002737410071942446, "loss": 0.7845, "mean_token_accuracy": 0.7770460993051529, "num_tokens": 38589881.0, "step": 630 }, { "entropy": 0.8310546875, "epoch": 2.2705139765554554, "grad_norm": 0.18049819767475128, "learning_rate": 0.0002733812949640288, "loss": 0.7589, "mean_token_accuracy": 0.7873687297105789, "num_tokens": 38650480.0, "step": 631 }, { "entropy": 0.80078125, "epoch": 2.2741208295761948, "grad_norm": 0.17625480890274048, "learning_rate": 0.00027302158273381295, "loss": 0.7302, "mean_token_accuracy": 0.794575423002243, "num_tokens": 38712715.0, "step": 632 }, { "entropy": 0.806640625, "epoch": 2.277727682596934, "grad_norm": 0.18563911318778992, "learning_rate": 0.00027266187050359715, "loss": 0.7468, "mean_token_accuracy": 0.7871198803186417, "num_tokens": 38773942.0, "step": 633 }, { "entropy": 0.8095703125, "epoch": 2.2813345356176735, "grad_norm": 0.20236892998218536, "learning_rate": 0.0002723021582733813, "loss": 0.7701, "mean_token_accuracy": 0.7833315581083298, "num_tokens": 38837133.0, "step": 634 }, { "entropy": 0.8232421875, "epoch": 2.284941388638413, "grad_norm": 0.1783973127603531, "learning_rate": 0.00027194244604316546, "loss": 0.7701, "mean_token_accuracy": 0.7837322652339935, "num_tokens": 38899936.0, "step": 635 }, { "entropy": 0.810546875, "epoch": 2.2885482416591523, "grad_norm": 0.17648401856422424, "learning_rate": 0.0002715827338129496, "loss": 0.7408, "mean_token_accuracy": 0.7896586209535599, "num_tokens": 38961444.0, "step": 636 }, { "entropy": 0.865234375, "epoch": 2.2921550946798916, "grad_norm": 0.18382835388183594, "learning_rate": 0.0002712230215827338, "loss": 0.7858, "mean_token_accuracy": 0.7781921476125717, "num_tokens": 39020427.0, "step": 637 }, { "entropy": 0.86328125, "epoch": 2.295761947700631, "grad_norm": 0.1770496964454651, "learning_rate": 0.00027086330935251803, "loss": 0.7862, "mean_token_accuracy": 0.7789389193058014, "num_tokens": 39080398.0, "step": 638 }, { "entropy": 0.84375, "epoch": 2.2993688007213704, "grad_norm": 0.17726635932922363, "learning_rate": 0.00027050359712230213, "loss": 0.7625, "mean_token_accuracy": 0.7854194343090057, "num_tokens": 39141567.0, "step": 639 }, { "entropy": 0.8056640625, "epoch": 2.30297565374211, "grad_norm": 0.18265493214130402, "learning_rate": 0.00027014388489208634, "loss": 0.7563, "mean_token_accuracy": 0.7864619940519333, "num_tokens": 39201921.0, "step": 640 }, { "entropy": 0.806640625, "epoch": 2.3065825067628496, "grad_norm": 0.17957665026187897, "learning_rate": 0.0002697841726618705, "loss": 0.759, "mean_token_accuracy": 0.7814882099628448, "num_tokens": 39262331.0, "step": 641 }, { "entropy": 0.78515625, "epoch": 2.310189359783589, "grad_norm": 0.1859419345855713, "learning_rate": 0.0002694244604316547, "loss": 0.7349, "mean_token_accuracy": 0.7915872484445572, "num_tokens": 39322798.0, "step": 642 }, { "entropy": 0.7978515625, "epoch": 2.3137962128043283, "grad_norm": 0.19019371271133423, "learning_rate": 0.00026906474820143886, "loss": 0.7461, "mean_token_accuracy": 0.7890315800905228, "num_tokens": 39384673.0, "step": 643 }, { "entropy": 0.8271484375, "epoch": 2.3174030658250677, "grad_norm": 0.1849343627691269, "learning_rate": 0.000268705035971223, "loss": 0.7688, "mean_token_accuracy": 0.7834508270025253, "num_tokens": 39444648.0, "step": 644 }, { "entropy": 0.8056640625, "epoch": 2.321009918845807, "grad_norm": 0.17757967114448547, "learning_rate": 0.00026834532374100717, "loss": 0.7268, "mean_token_accuracy": 0.793069139122963, "num_tokens": 39506594.0, "step": 645 }, { "entropy": 0.828125, "epoch": 2.3246167718665465, "grad_norm": 0.18236634135246277, "learning_rate": 0.0002679856115107914, "loss": 0.7655, "mean_token_accuracy": 0.7819109708070755, "num_tokens": 39567680.0, "step": 646 }, { "entropy": 0.8125, "epoch": 2.328223624887286, "grad_norm": 0.18062734603881836, "learning_rate": 0.0002676258992805756, "loss": 0.7502, "mean_token_accuracy": 0.7865216583013535, "num_tokens": 39627118.0, "step": 647 }, { "entropy": 0.798828125, "epoch": 2.3318304779080252, "grad_norm": 0.17353413999080658, "learning_rate": 0.0002672661870503597, "loss": 0.728, "mean_token_accuracy": 0.7930381447076797, "num_tokens": 39688253.0, "step": 648 }, { "entropy": 0.7841796875, "epoch": 2.3354373309287646, "grad_norm": 0.17404480278491974, "learning_rate": 0.0002669064748201439, "loss": 0.7276, "mean_token_accuracy": 0.7932890057563782, "num_tokens": 39750444.0, "step": 649 }, { "entropy": 0.818359375, "epoch": 2.339044183949504, "grad_norm": 0.18468070030212402, "learning_rate": 0.00026654676258992805, "loss": 0.7634, "mean_token_accuracy": 0.7842660397291183, "num_tokens": 39811341.0, "step": 650 }, { "entropy": 0.796875, "epoch": 2.3426510369702434, "grad_norm": 0.17830346524715424, "learning_rate": 0.00026618705035971225, "loss": 0.7465, "mean_token_accuracy": 0.7903191894292831, "num_tokens": 39873422.0, "step": 651 }, { "entropy": 0.8232421875, "epoch": 2.3462578899909827, "grad_norm": 0.1852332353591919, "learning_rate": 0.0002658273381294964, "loss": 0.7663, "mean_token_accuracy": 0.7851741462945938, "num_tokens": 39934688.0, "step": 652 }, { "entropy": 0.7900390625, "epoch": 2.349864743011722, "grad_norm": 0.19075137376785278, "learning_rate": 0.00026546762589928056, "loss": 0.7454, "mean_token_accuracy": 0.7875671684741974, "num_tokens": 39994806.0, "step": 653 }, { "entropy": 0.814453125, "epoch": 2.3534715960324615, "grad_norm": 0.18778453767299652, "learning_rate": 0.0002651079136690647, "loss": 0.744, "mean_token_accuracy": 0.7911556959152222, "num_tokens": 40054861.0, "step": 654 }, { "entropy": 0.8291015625, "epoch": 2.3570784490532013, "grad_norm": 0.17905113101005554, "learning_rate": 0.0002647482014388489, "loss": 0.7474, "mean_token_accuracy": 0.7888215035200119, "num_tokens": 40117279.0, "step": 655 }, { "entropy": 0.8125, "epoch": 2.3606853020739407, "grad_norm": 0.17837324738502502, "learning_rate": 0.00026438848920863313, "loss": 0.7416, "mean_token_accuracy": 0.7889131307601929, "num_tokens": 40178359.0, "step": 656 }, { "entropy": 0.818359375, "epoch": 2.36429215509468, "grad_norm": 0.18312481045722961, "learning_rate": 0.0002640287769784173, "loss": 0.7392, "mean_token_accuracy": 0.7901452481746674, "num_tokens": 40239060.0, "step": 657 }, { "entropy": 0.798828125, "epoch": 2.3678990081154194, "grad_norm": 0.18298989534378052, "learning_rate": 0.00026366906474820144, "loss": 0.7338, "mean_token_accuracy": 0.7924284487962723, "num_tokens": 40299746.0, "step": 658 }, { "entropy": 0.796875, "epoch": 2.371505861136159, "grad_norm": 0.19151635468006134, "learning_rate": 0.0002633093525179856, "loss": 0.7453, "mean_token_accuracy": 0.7898814529180527, "num_tokens": 40359289.0, "step": 659 }, { "entropy": 0.7822265625, "epoch": 2.375112714156898, "grad_norm": 0.1924605667591095, "learning_rate": 0.0002629496402877698, "loss": 0.7348, "mean_token_accuracy": 0.7912735342979431, "num_tokens": 40419790.0, "step": 660 }, { "entropy": 0.833984375, "epoch": 2.3787195671776376, "grad_norm": 0.2051534205675125, "learning_rate": 0.00026258992805755396, "loss": 0.7931, "mean_token_accuracy": 0.7752101421356201, "num_tokens": 40481689.0, "step": 661 }, { "entropy": 0.77734375, "epoch": 2.382326420198377, "grad_norm": 0.1781076341867447, "learning_rate": 0.0002622302158273381, "loss": 0.7261, "mean_token_accuracy": 0.7933123856782913, "num_tokens": 40543246.0, "step": 662 }, { "entropy": 0.7958984375, "epoch": 2.3859332732191163, "grad_norm": 0.18318699300289154, "learning_rate": 0.0002618705035971223, "loss": 0.7179, "mean_token_accuracy": 0.7979258298873901, "num_tokens": 40605728.0, "step": 663 }, { "entropy": 0.818359375, "epoch": 2.3895401262398557, "grad_norm": 0.18104508519172668, "learning_rate": 0.0002615107913669065, "loss": 0.7616, "mean_token_accuracy": 0.7839236855506897, "num_tokens": 40666445.0, "step": 664 }, { "entropy": 0.8212890625, "epoch": 2.393146979260595, "grad_norm": 0.18478626012802124, "learning_rate": 0.0002611510791366907, "loss": 0.763, "mean_token_accuracy": 0.7842493057250977, "num_tokens": 40726617.0, "step": 665 }, { "entropy": 0.80078125, "epoch": 2.3967538322813344, "grad_norm": 0.18380814790725708, "learning_rate": 0.00026079136690647484, "loss": 0.7391, "mean_token_accuracy": 0.787474051117897, "num_tokens": 40788457.0, "step": 666 }, { "entropy": 0.7919921875, "epoch": 2.400360685302074, "grad_norm": 0.17584605515003204, "learning_rate": 0.000260431654676259, "loss": 0.736, "mean_token_accuracy": 0.7906964421272278, "num_tokens": 40849722.0, "step": 667 }, { "entropy": 0.8115234375, "epoch": 2.403967538322813, "grad_norm": 0.27526551485061646, "learning_rate": 0.00026007194244604315, "loss": 0.7493, "mean_token_accuracy": 0.7874036282300949, "num_tokens": 40909231.0, "step": 668 }, { "entropy": 0.8291015625, "epoch": 2.4075743913435526, "grad_norm": 0.18538732826709747, "learning_rate": 0.00025971223021582735, "loss": 0.7525, "mean_token_accuracy": 0.78651924431324, "num_tokens": 40969185.0, "step": 669 }, { "entropy": 0.8017578125, "epoch": 2.4111812443642924, "grad_norm": 0.17820103466510773, "learning_rate": 0.0002593525179856115, "loss": 0.7362, "mean_token_accuracy": 0.789430171251297, "num_tokens": 41032266.0, "step": 670 }, { "entropy": 0.806640625, "epoch": 2.4147880973850318, "grad_norm": 0.1807846873998642, "learning_rate": 0.00025899280575539566, "loss": 0.7454, "mean_token_accuracy": 0.7877747714519501, "num_tokens": 41094038.0, "step": 671 }, { "entropy": 0.83203125, "epoch": 2.418394950405771, "grad_norm": 0.18689583241939545, "learning_rate": 0.00025863309352517987, "loss": 0.787, "mean_token_accuracy": 0.7792766243219376, "num_tokens": 41155346.0, "step": 672 }, { "entropy": 0.798828125, "epoch": 2.4220018034265105, "grad_norm": 0.18755735456943512, "learning_rate": 0.000258273381294964, "loss": 0.7535, "mean_token_accuracy": 0.7855059802532196, "num_tokens": 41216738.0, "step": 673 }, { "entropy": 0.818359375, "epoch": 2.42560865644725, "grad_norm": 0.18880711495876312, "learning_rate": 0.00025791366906474823, "loss": 0.7454, "mean_token_accuracy": 0.7908457070589066, "num_tokens": 41277092.0, "step": 674 }, { "entropy": 0.7919921875, "epoch": 2.4292155094679893, "grad_norm": 0.17955949902534485, "learning_rate": 0.0002575539568345324, "loss": 0.7225, "mean_token_accuracy": 0.7963331490755081, "num_tokens": 41339093.0, "step": 675 }, { "entropy": 0.8427734375, "epoch": 2.4328223624887286, "grad_norm": 0.1915508657693863, "learning_rate": 0.00025719424460431654, "loss": 0.7848, "mean_token_accuracy": 0.7786163985729218, "num_tokens": 41398785.0, "step": 676 }, { "entropy": 0.79296875, "epoch": 2.436429215509468, "grad_norm": 0.18558816611766815, "learning_rate": 0.0002568345323741007, "loss": 0.7253, "mean_token_accuracy": 0.793020561337471, "num_tokens": 41459029.0, "step": 677 }, { "entropy": 0.8037109375, "epoch": 2.4400360685302074, "grad_norm": 0.19046945869922638, "learning_rate": 0.0002564748201438849, "loss": 0.76, "mean_token_accuracy": 0.7822255045175552, "num_tokens": 41520639.0, "step": 678 }, { "entropy": 0.787109375, "epoch": 2.4436429215509468, "grad_norm": 0.183790385723114, "learning_rate": 0.00025611510791366906, "loss": 0.7328, "mean_token_accuracy": 0.7890102118253708, "num_tokens": 41581587.0, "step": 679 }, { "entropy": 0.83203125, "epoch": 2.447249774571686, "grad_norm": 0.19064383208751678, "learning_rate": 0.0002557553956834532, "loss": 0.7794, "mean_token_accuracy": 0.7800986170768738, "num_tokens": 41642928.0, "step": 680 }, { "entropy": 0.822265625, "epoch": 2.4508566275924255, "grad_norm": 0.18606357276439667, "learning_rate": 0.0002553956834532374, "loss": 0.7584, "mean_token_accuracy": 0.7842763811349869, "num_tokens": 41704025.0, "step": 681 }, { "entropy": 0.83984375, "epoch": 2.454463480613165, "grad_norm": 0.18718992173671722, "learning_rate": 0.0002550359712230216, "loss": 0.7674, "mean_token_accuracy": 0.7827232927083969, "num_tokens": 41764174.0, "step": 682 }, { "entropy": 0.83203125, "epoch": 2.4580703336339043, "grad_norm": 0.17916011810302734, "learning_rate": 0.0002546762589928058, "loss": 0.7596, "mean_token_accuracy": 0.7844974398612976, "num_tokens": 41825701.0, "step": 683 }, { "entropy": 0.794921875, "epoch": 2.4616771866546436, "grad_norm": 0.18821115791797638, "learning_rate": 0.00025431654676258994, "loss": 0.7254, "mean_token_accuracy": 0.79192815721035, "num_tokens": 41887147.0, "step": 684 }, { "entropy": 0.78515625, "epoch": 2.4652840396753835, "grad_norm": 0.18130646646022797, "learning_rate": 0.0002539568345323741, "loss": 0.7273, "mean_token_accuracy": 0.7922435998916626, "num_tokens": 41948938.0, "step": 685 }, { "entropy": 0.7841796875, "epoch": 2.4688908926961224, "grad_norm": 0.1905827671289444, "learning_rate": 0.00025359712230215825, "loss": 0.7324, "mean_token_accuracy": 0.7925073951482773, "num_tokens": 42010408.0, "step": 686 }, { "entropy": 0.7578125, "epoch": 2.472497745716862, "grad_norm": 0.17550453543663025, "learning_rate": 0.00025323741007194246, "loss": 0.7184, "mean_token_accuracy": 0.794641301035881, "num_tokens": 42072278.0, "step": 687 }, { "entropy": 0.771484375, "epoch": 2.4761045987376016, "grad_norm": 0.1889595091342926, "learning_rate": 0.00025287769784172666, "loss": 0.7234, "mean_token_accuracy": 0.7923927307128906, "num_tokens": 42133680.0, "step": 688 }, { "entropy": 0.8134765625, "epoch": 2.479711451758341, "grad_norm": 0.19415535032749176, "learning_rate": 0.0002525179856115108, "loss": 0.7572, "mean_token_accuracy": 0.7856470048427582, "num_tokens": 42194672.0, "step": 689 }, { "entropy": 0.8388671875, "epoch": 2.4833183047790803, "grad_norm": 0.1968717724084854, "learning_rate": 0.00025215827338129497, "loss": 0.7657, "mean_token_accuracy": 0.7825149297714233, "num_tokens": 42255675.0, "step": 690 }, { "entropy": 0.8525390625, "epoch": 2.4869251577998197, "grad_norm": 0.1847522109746933, "learning_rate": 0.0002517985611510791, "loss": 0.7723, "mean_token_accuracy": 0.7832875847816467, "num_tokens": 42317404.0, "step": 691 }, { "entropy": 0.83203125, "epoch": 2.490532010820559, "grad_norm": 0.1932613104581833, "learning_rate": 0.00025143884892086333, "loss": 0.7558, "mean_token_accuracy": 0.7859340161085129, "num_tokens": 42377156.0, "step": 692 }, { "entropy": 0.80859375, "epoch": 2.4941388638412985, "grad_norm": 0.18822742998600006, "learning_rate": 0.0002510791366906475, "loss": 0.7419, "mean_token_accuracy": 0.7890448868274689, "num_tokens": 42439746.0, "step": 693 }, { "entropy": 0.8125, "epoch": 2.497745716862038, "grad_norm": 0.18603956699371338, "learning_rate": 0.00025071942446043164, "loss": 0.745, "mean_token_accuracy": 0.7898269891738892, "num_tokens": 42502186.0, "step": 694 }, { "entropy": 0.787109375, "epoch": 2.501352569882777, "grad_norm": 0.19746778905391693, "learning_rate": 0.0002503597122302158, "loss": 0.7187, "mean_token_accuracy": 0.7969749420881271, "num_tokens": 42562449.0, "step": 695 }, { "entropy": 0.802734375, "epoch": 2.5049594229035166, "grad_norm": 0.20026710629463196, "learning_rate": 0.00025, "loss": 0.7782, "mean_token_accuracy": 0.7816534042358398, "num_tokens": 42623293.0, "step": 696 }, { "entropy": 0.80859375, "epoch": 2.508566275924256, "grad_norm": 0.1875135749578476, "learning_rate": 0.00024964028776978416, "loss": 0.7622, "mean_token_accuracy": 0.7854504138231277, "num_tokens": 42684073.0, "step": 697 }, { "entropy": 0.810546875, "epoch": 2.5121731289449953, "grad_norm": 0.18824613094329834, "learning_rate": 0.00024928057553956837, "loss": 0.7368, "mean_token_accuracy": 0.7897227704524994, "num_tokens": 42744580.0, "step": 698 }, { "entropy": 0.7998046875, "epoch": 2.5157799819657347, "grad_norm": 0.18143995106220245, "learning_rate": 0.0002489208633093525, "loss": 0.7299, "mean_token_accuracy": 0.7920649945735931, "num_tokens": 42806844.0, "step": 699 }, { "entropy": 0.802734375, "epoch": 2.5193868349864745, "grad_norm": 0.18213969469070435, "learning_rate": 0.0002485611510791367, "loss": 0.7306, "mean_token_accuracy": 0.7908004373311996, "num_tokens": 42868129.0, "step": 700 }, { "entropy": 0.8125, "epoch": 2.5229936880072135, "grad_norm": 0.18903671205043793, "learning_rate": 0.0002482014388489209, "loss": 0.7469, "mean_token_accuracy": 0.7863635122776031, "num_tokens": 42930683.0, "step": 701 }, { "entropy": 0.8017578125, "epoch": 2.5266005410279533, "grad_norm": 0.1889694780111313, "learning_rate": 0.00024784172661870504, "loss": 0.7418, "mean_token_accuracy": 0.7872268110513687, "num_tokens": 42991332.0, "step": 702 }, { "entropy": 0.83203125, "epoch": 2.5302073940486927, "grad_norm": 0.19017133116722107, "learning_rate": 0.0002474820143884892, "loss": 0.7785, "mean_token_accuracy": 0.7798624485731125, "num_tokens": 43053652.0, "step": 703 }, { "entropy": 0.8203125, "epoch": 2.533814247069432, "grad_norm": 0.1996258944272995, "learning_rate": 0.0002471223021582734, "loss": 0.7533, "mean_token_accuracy": 0.787124291062355, "num_tokens": 43111425.0, "step": 704 }, { "entropy": 0.783203125, "epoch": 2.5374211000901714, "grad_norm": 0.18286173045635223, "learning_rate": 0.00024676258992805756, "loss": 0.7241, "mean_token_accuracy": 0.7935687154531479, "num_tokens": 43172785.0, "step": 705 }, { "entropy": 0.783203125, "epoch": 2.541027953110911, "grad_norm": 0.18081596493721008, "learning_rate": 0.0002464028776978417, "loss": 0.7254, "mean_token_accuracy": 0.7937474548816681, "num_tokens": 43233191.0, "step": 706 }, { "entropy": 0.7919921875, "epoch": 2.54463480613165, "grad_norm": 0.1905541568994522, "learning_rate": 0.0002460431654676259, "loss": 0.7448, "mean_token_accuracy": 0.7878510802984238, "num_tokens": 43295570.0, "step": 707 }, { "entropy": 0.8115234375, "epoch": 2.5482416591523895, "grad_norm": 0.1963399350643158, "learning_rate": 0.00024568345323741007, "loss": 0.7625, "mean_token_accuracy": 0.7859576642513275, "num_tokens": 43357684.0, "step": 708 }, { "entropy": 0.8046875, "epoch": 2.551848512173129, "grad_norm": 0.18603718280792236, "learning_rate": 0.0002453237410071943, "loss": 0.7298, "mean_token_accuracy": 0.7930157631635666, "num_tokens": 43418672.0, "step": 709 }, { "entropy": 0.810546875, "epoch": 2.5554553651938683, "grad_norm": 0.18686091899871826, "learning_rate": 0.00024496402877697843, "loss": 0.7446, "mean_token_accuracy": 0.7891083061695099, "num_tokens": 43479955.0, "step": 710 }, { "entropy": 0.796875, "epoch": 2.5590622182146077, "grad_norm": 0.18624548614025116, "learning_rate": 0.0002446043165467626, "loss": 0.7209, "mean_token_accuracy": 0.7942554801702499, "num_tokens": 43540542.0, "step": 711 }, { "entropy": 0.810546875, "epoch": 2.562669071235347, "grad_norm": 0.18408162891864777, "learning_rate": 0.0002442446043165468, "loss": 0.7472, "mean_token_accuracy": 0.7888778895139694, "num_tokens": 43602437.0, "step": 712 }, { "entropy": 0.794921875, "epoch": 2.5662759242560864, "grad_norm": 0.19563119113445282, "learning_rate": 0.00024388489208633095, "loss": 0.7452, "mean_token_accuracy": 0.7893655449151993, "num_tokens": 43662748.0, "step": 713 }, { "entropy": 0.751953125, "epoch": 2.569882777276826, "grad_norm": 0.18472838401794434, "learning_rate": 0.0002435251798561151, "loss": 0.6878, "mean_token_accuracy": 0.8033913820981979, "num_tokens": 43725985.0, "step": 714 }, { "entropy": 0.7568359375, "epoch": 2.5734896302975656, "grad_norm": 0.18734216690063477, "learning_rate": 0.0002431654676258993, "loss": 0.7225, "mean_token_accuracy": 0.793334037065506, "num_tokens": 43787661.0, "step": 715 }, { "entropy": 0.7978515625, "epoch": 2.5770964833183045, "grad_norm": 0.1985713541507721, "learning_rate": 0.00024280575539568344, "loss": 0.7491, "mean_token_accuracy": 0.7863568961620331, "num_tokens": 43848765.0, "step": 716 }, { "entropy": 0.8046875, "epoch": 2.5807033363390444, "grad_norm": 0.18662510812282562, "learning_rate": 0.00024244604316546765, "loss": 0.7372, "mean_token_accuracy": 0.7895476818084717, "num_tokens": 43910987.0, "step": 717 }, { "entropy": 0.8251953125, "epoch": 2.5843101893597837, "grad_norm": 0.1837340146303177, "learning_rate": 0.0002420863309352518, "loss": 0.7449, "mean_token_accuracy": 0.7865401655435562, "num_tokens": 43971356.0, "step": 718 }, { "entropy": 0.7998046875, "epoch": 2.587917042380523, "grad_norm": 0.18560951948165894, "learning_rate": 0.00024172661870503598, "loss": 0.7253, "mean_token_accuracy": 0.793497622013092, "num_tokens": 44033271.0, "step": 719 }, { "entropy": 0.8095703125, "epoch": 2.5915238954012625, "grad_norm": 0.18571411073207855, "learning_rate": 0.00024136690647482017, "loss": 0.7485, "mean_token_accuracy": 0.7876099795103073, "num_tokens": 44095007.0, "step": 720 }, { "entropy": 0.7724609375, "epoch": 2.595130748422002, "grad_norm": 0.18888762593269348, "learning_rate": 0.00024100719424460432, "loss": 0.7271, "mean_token_accuracy": 0.797669529914856, "num_tokens": 44156597.0, "step": 721 }, { "entropy": 0.7646484375, "epoch": 2.5987376014427412, "grad_norm": 0.19286960363388062, "learning_rate": 0.0002406474820143885, "loss": 0.7341, "mean_token_accuracy": 0.7908933162689209, "num_tokens": 44219003.0, "step": 722 }, { "entropy": 0.8193359375, "epoch": 2.6023444544634806, "grad_norm": 0.18752166628837585, "learning_rate": 0.00024028776978417266, "loss": 0.7676, "mean_token_accuracy": 0.786169245839119, "num_tokens": 44280555.0, "step": 723 }, { "entropy": 0.8291015625, "epoch": 2.60595130748422, "grad_norm": 0.1956016570329666, "learning_rate": 0.00023992805755395684, "loss": 0.7814, "mean_token_accuracy": 0.7793909758329391, "num_tokens": 44341235.0, "step": 724 }, { "entropy": 0.8095703125, "epoch": 2.6095581605049594, "grad_norm": 0.1808100789785385, "learning_rate": 0.000239568345323741, "loss": 0.7268, "mean_token_accuracy": 0.7953687608242035, "num_tokens": 44402028.0, "step": 725 }, { "entropy": 0.8505859375, "epoch": 2.6131650135256987, "grad_norm": 0.18766848742961884, "learning_rate": 0.0002392086330935252, "loss": 0.7727, "mean_token_accuracy": 0.7833770364522934, "num_tokens": 44464021.0, "step": 726 }, { "entropy": 0.8037109375, "epoch": 2.616771866546438, "grad_norm": 0.1889326125383377, "learning_rate": 0.00023884892086330935, "loss": 0.725, "mean_token_accuracy": 0.7950294315814972, "num_tokens": 44523185.0, "step": 727 }, { "entropy": 0.7841796875, "epoch": 2.6203787195671775, "grad_norm": 0.18878653645515442, "learning_rate": 0.00023848920863309353, "loss": 0.724, "mean_token_accuracy": 0.7916217297315598, "num_tokens": 44586257.0, "step": 728 }, { "entropy": 0.7841796875, "epoch": 2.623985572587917, "grad_norm": 0.19267606735229492, "learning_rate": 0.00023812949640287772, "loss": 0.7283, "mean_token_accuracy": 0.792647585272789, "num_tokens": 44644794.0, "step": 729 }, { "entropy": 0.755859375, "epoch": 2.6275924256086567, "grad_norm": 0.19228503108024597, "learning_rate": 0.00023776978417266187, "loss": 0.7087, "mean_token_accuracy": 0.7992714792490005, "num_tokens": 44707228.0, "step": 730 }, { "entropy": 0.720703125, "epoch": 2.6311992786293956, "grad_norm": 0.19410209357738495, "learning_rate": 0.00023741007194244605, "loss": 0.6754, "mean_token_accuracy": 0.8049658834934235, "num_tokens": 44770644.0, "step": 731 }, { "entropy": 0.7919921875, "epoch": 2.6348061316501354, "grad_norm": 0.2008974403142929, "learning_rate": 0.0002370503597122302, "loss": 0.7449, "mean_token_accuracy": 0.7882468104362488, "num_tokens": 44832616.0, "step": 732 }, { "entropy": 0.798828125, "epoch": 2.638412984670875, "grad_norm": 0.18662530183792114, "learning_rate": 0.0002366906474820144, "loss": 0.7417, "mean_token_accuracy": 0.7877643406391144, "num_tokens": 44894640.0, "step": 733 }, { "entropy": 0.7744140625, "epoch": 2.642019837691614, "grad_norm": 0.18222850561141968, "learning_rate": 0.00023633093525179857, "loss": 0.696, "mean_token_accuracy": 0.7985576689243317, "num_tokens": 44956794.0, "step": 734 }, { "entropy": 0.796875, "epoch": 2.6456266907123536, "grad_norm": 0.1835268884897232, "learning_rate": 0.00023597122302158275, "loss": 0.7281, "mean_token_accuracy": 0.7926303595304489, "num_tokens": 45016475.0, "step": 735 }, { "entropy": 0.806640625, "epoch": 2.649233543733093, "grad_norm": 0.18914930522441864, "learning_rate": 0.00023561151079136693, "loss": 0.7532, "mean_token_accuracy": 0.7870346307754517, "num_tokens": 45076950.0, "step": 736 }, { "entropy": 0.8076171875, "epoch": 2.6528403967538323, "grad_norm": 0.18397071957588196, "learning_rate": 0.00023525179856115109, "loss": 0.7433, "mean_token_accuracy": 0.7878353744745255, "num_tokens": 45138316.0, "step": 737 }, { "entropy": 0.779296875, "epoch": 2.6564472497745717, "grad_norm": 0.19810053706169128, "learning_rate": 0.00023489208633093527, "loss": 0.7323, "mean_token_accuracy": 0.7917691767215729, "num_tokens": 45199361.0, "step": 738 }, { "entropy": 0.79296875, "epoch": 2.660054102795311, "grad_norm": 0.21600043773651123, "learning_rate": 0.00023453237410071942, "loss": 0.7561, "mean_token_accuracy": 0.7879060357809067, "num_tokens": 45260832.0, "step": 739 }, { "entropy": 0.7705078125, "epoch": 2.6636609558160504, "grad_norm": 0.18814897537231445, "learning_rate": 0.0002341726618705036, "loss": 0.7235, "mean_token_accuracy": 0.7941046804189682, "num_tokens": 45324323.0, "step": 740 }, { "entropy": 0.806640625, "epoch": 2.66726780883679, "grad_norm": 0.19024305045604706, "learning_rate": 0.00023381294964028776, "loss": 0.7503, "mean_token_accuracy": 0.7862609773874283, "num_tokens": 45385883.0, "step": 741 }, { "entropy": 0.7861328125, "epoch": 2.670874661857529, "grad_norm": 0.18351052701473236, "learning_rate": 0.00023345323741007194, "loss": 0.7166, "mean_token_accuracy": 0.7947791218757629, "num_tokens": 45447449.0, "step": 742 }, { "entropy": 0.8115234375, "epoch": 2.6744815148782686, "grad_norm": 0.18955351412296295, "learning_rate": 0.00023309352517985612, "loss": 0.7483, "mean_token_accuracy": 0.787923738360405, "num_tokens": 45509214.0, "step": 743 }, { "entropy": 0.8154296875, "epoch": 2.678088367899008, "grad_norm": 0.18804828822612762, "learning_rate": 0.0002327338129496403, "loss": 0.7362, "mean_token_accuracy": 0.7902804017066956, "num_tokens": 45570857.0, "step": 744 }, { "entropy": 0.818359375, "epoch": 2.6816952209197478, "grad_norm": 0.1943105310201645, "learning_rate": 0.00023237410071942448, "loss": 0.7323, "mean_token_accuracy": 0.7925425320863724, "num_tokens": 45630365.0, "step": 745 }, { "entropy": 0.79296875, "epoch": 2.6853020739404867, "grad_norm": 0.1877983808517456, "learning_rate": 0.00023201438848920864, "loss": 0.7438, "mean_token_accuracy": 0.7889807671308517, "num_tokens": 45692735.0, "step": 746 }, { "entropy": 0.7626953125, "epoch": 2.6889089269612265, "grad_norm": 0.19481609761714935, "learning_rate": 0.00023165467625899282, "loss": 0.7165, "mean_token_accuracy": 0.7960446923971176, "num_tokens": 45752837.0, "step": 747 }, { "entropy": 0.7880859375, "epoch": 2.692515779981966, "grad_norm": 0.20884722471237183, "learning_rate": 0.00023129496402877697, "loss": 0.7558, "mean_token_accuracy": 0.7862965613603592, "num_tokens": 45814369.0, "step": 748 }, { "entropy": 0.7880859375, "epoch": 2.6961226330027053, "grad_norm": 0.19201287627220154, "learning_rate": 0.00023093525179856115, "loss": 0.725, "mean_token_accuracy": 0.7939475625753403, "num_tokens": 45874810.0, "step": 749 }, { "entropy": 0.7890625, "epoch": 2.6997294860234446, "grad_norm": 0.18726278841495514, "learning_rate": 0.0002305755395683453, "loss": 0.7097, "mean_token_accuracy": 0.7980274558067322, "num_tokens": 45934536.0, "step": 750 }, { "entropy": 0.802734375, "epoch": 2.703336339044184, "grad_norm": 0.19711144268512726, "learning_rate": 0.00023021582733812951, "loss": 0.7381, "mean_token_accuracy": 0.7906654924154282, "num_tokens": 45994165.0, "step": 751 }, { "entropy": 0.822265625, "epoch": 2.7069431920649234, "grad_norm": 0.18206214904785156, "learning_rate": 0.0002298561151079137, "loss": 0.7401, "mean_token_accuracy": 0.7920153737068176, "num_tokens": 46057445.0, "step": 752 }, { "entropy": 0.787109375, "epoch": 2.7105500450856628, "grad_norm": 0.2000550776720047, "learning_rate": 0.00022949640287769785, "loss": 0.7069, "mean_token_accuracy": 0.7988585084676743, "num_tokens": 46117788.0, "step": 753 }, { "entropy": 0.822265625, "epoch": 2.714156898106402, "grad_norm": 0.19447217881679535, "learning_rate": 0.00022913669064748203, "loss": 0.758, "mean_token_accuracy": 0.7862460315227509, "num_tokens": 46179431.0, "step": 754 }, { "entropy": 0.80859375, "epoch": 2.7177637511271415, "grad_norm": 0.20358894765377045, "learning_rate": 0.00022877697841726619, "loss": 0.7671, "mean_token_accuracy": 0.7836468666791916, "num_tokens": 46241609.0, "step": 755 }, { "entropy": 0.7646484375, "epoch": 2.721370604147881, "grad_norm": 0.18528415262699127, "learning_rate": 0.00022841726618705037, "loss": 0.711, "mean_token_accuracy": 0.7977322190999985, "num_tokens": 46304575.0, "step": 756 }, { "entropy": 0.751953125, "epoch": 2.7249774571686203, "grad_norm": 0.1857839673757553, "learning_rate": 0.00022805755395683452, "loss": 0.7073, "mean_token_accuracy": 0.7987954020500183, "num_tokens": 46366988.0, "step": 757 }, { "entropy": 0.7734375, "epoch": 2.7285843101893597, "grad_norm": 0.19079576432704926, "learning_rate": 0.0002276978417266187, "loss": 0.739, "mean_token_accuracy": 0.7886064052581787, "num_tokens": 46428697.0, "step": 758 }, { "entropy": 0.76953125, "epoch": 2.732191163210099, "grad_norm": 0.2078651785850525, "learning_rate": 0.00022733812949640288, "loss": 0.7288, "mean_token_accuracy": 0.790064737200737, "num_tokens": 46488964.0, "step": 759 }, { "entropy": 0.7958984375, "epoch": 2.735798016230839, "grad_norm": 0.18909852206707, "learning_rate": 0.00022697841726618706, "loss": 0.7403, "mean_token_accuracy": 0.7896477729082108, "num_tokens": 46552005.0, "step": 760 }, { "entropy": 0.78125, "epoch": 2.739404869251578, "grad_norm": 0.18465638160705566, "learning_rate": 0.00022661870503597125, "loss": 0.712, "mean_token_accuracy": 0.7969123274087906, "num_tokens": 46612613.0, "step": 761 }, { "entropy": 0.806640625, "epoch": 2.7430117222723176, "grad_norm": 0.19121190905570984, "learning_rate": 0.0002262589928057554, "loss": 0.7233, "mean_token_accuracy": 0.7921807914972305, "num_tokens": 46673640.0, "step": 762 }, { "entropy": 0.80078125, "epoch": 2.746618575293057, "grad_norm": 0.1835874319076538, "learning_rate": 0.00022589928057553958, "loss": 0.7293, "mean_token_accuracy": 0.7924755066633224, "num_tokens": 46736360.0, "step": 763 }, { "entropy": 0.8251953125, "epoch": 2.7502254283137963, "grad_norm": 0.19050973653793335, "learning_rate": 0.00022553956834532374, "loss": 0.7605, "mean_token_accuracy": 0.7866650372743607, "num_tokens": 46796921.0, "step": 764 }, { "entropy": 0.783203125, "epoch": 2.7538322813345357, "grad_norm": 0.17873775959014893, "learning_rate": 0.00022517985611510792, "loss": 0.7195, "mean_token_accuracy": 0.7962078005075455, "num_tokens": 46857837.0, "step": 765 }, { "entropy": 0.8359375, "epoch": 2.757439134355275, "grad_norm": 0.19745934009552002, "learning_rate": 0.00022482014388489207, "loss": 0.7766, "mean_token_accuracy": 0.7819628864526749, "num_tokens": 46918596.0, "step": 766 }, { "entropy": 0.78515625, "epoch": 2.7610459873760145, "grad_norm": 0.20087561011314392, "learning_rate": 0.00022446043165467625, "loss": 0.752, "mean_token_accuracy": 0.7872990369796753, "num_tokens": 46981115.0, "step": 767 }, { "entropy": 0.765625, "epoch": 2.764652840396754, "grad_norm": 0.18688799440860748, "learning_rate": 0.00022410071942446046, "loss": 0.7078, "mean_token_accuracy": 0.7980319112539291, "num_tokens": 47043278.0, "step": 768 }, { "entropy": 0.7841796875, "epoch": 2.7682596934174932, "grad_norm": 0.18861721456050873, "learning_rate": 0.00022374100719424461, "loss": 0.73, "mean_token_accuracy": 0.7919085770845413, "num_tokens": 47106243.0, "step": 769 }, { "entropy": 0.7587890625, "epoch": 2.7718665464382326, "grad_norm": 0.1942640244960785, "learning_rate": 0.0002233812949640288, "loss": 0.7008, "mean_token_accuracy": 0.7985814660787582, "num_tokens": 47166972.0, "step": 770 }, { "entropy": 0.80859375, "epoch": 2.775473399458972, "grad_norm": 0.19008766114711761, "learning_rate": 0.00022302158273381295, "loss": 0.7351, "mean_token_accuracy": 0.7961627095937729, "num_tokens": 47228337.0, "step": 771 }, { "entropy": 0.7734375, "epoch": 2.7790802524797114, "grad_norm": 0.1897522211074829, "learning_rate": 0.00022266187050359713, "loss": 0.7125, "mean_token_accuracy": 0.7974579185247421, "num_tokens": 47287821.0, "step": 772 }, { "entropy": 0.759765625, "epoch": 2.7826871055004507, "grad_norm": 0.18404437601566315, "learning_rate": 0.00022230215827338129, "loss": 0.6931, "mean_token_accuracy": 0.8017688542604446, "num_tokens": 47351300.0, "step": 773 }, { "entropy": 0.7880859375, "epoch": 2.78629395852119, "grad_norm": 0.19722850620746613, "learning_rate": 0.00022194244604316547, "loss": 0.7347, "mean_token_accuracy": 0.7915790379047394, "num_tokens": 47412429.0, "step": 774 }, { "entropy": 0.7861328125, "epoch": 2.78990081154193, "grad_norm": 0.1912929266691208, "learning_rate": 0.00022158273381294962, "loss": 0.7231, "mean_token_accuracy": 0.7944567948579788, "num_tokens": 47475084.0, "step": 775 }, { "entropy": 0.78125, "epoch": 2.793507664562669, "grad_norm": 0.1926320344209671, "learning_rate": 0.00022122302158273383, "loss": 0.7065, "mean_token_accuracy": 0.797608345746994, "num_tokens": 47534720.0, "step": 776 }, { "entropy": 0.7353515625, "epoch": 2.7971145175834087, "grad_norm": 0.18497876822948456, "learning_rate": 0.000220863309352518, "loss": 0.688, "mean_token_accuracy": 0.8038123100996017, "num_tokens": 47596955.0, "step": 777 }, { "entropy": 0.75, "epoch": 2.8007213706041476, "grad_norm": 0.19857345521450043, "learning_rate": 0.00022050359712230216, "loss": 0.7058, "mean_token_accuracy": 0.7980986088514328, "num_tokens": 47659283.0, "step": 778 }, { "entropy": 0.7724609375, "epoch": 2.8043282236248874, "grad_norm": 0.19520068168640137, "learning_rate": 0.00022014388489208635, "loss": 0.702, "mean_token_accuracy": 0.799936830997467, "num_tokens": 47718919.0, "step": 779 }, { "entropy": 0.771484375, "epoch": 2.807935076645627, "grad_norm": 0.18946218490600586, "learning_rate": 0.0002197841726618705, "loss": 0.705, "mean_token_accuracy": 0.7979218810796738, "num_tokens": 47781653.0, "step": 780 }, { "entropy": 0.78515625, "epoch": 2.811541929666366, "grad_norm": 0.1970129758119583, "learning_rate": 0.00021942446043165468, "loss": 0.726, "mean_token_accuracy": 0.7950309216976166, "num_tokens": 47843536.0, "step": 781 }, { "entropy": 0.7802734375, "epoch": 2.8151487826871056, "grad_norm": 0.18409499526023865, "learning_rate": 0.00021906474820143884, "loss": 0.7073, "mean_token_accuracy": 0.797737643122673, "num_tokens": 47905520.0, "step": 782 }, { "entropy": 0.7861328125, "epoch": 2.818755635707845, "grad_norm": 0.18691697716712952, "learning_rate": 0.00021870503597122302, "loss": 0.7291, "mean_token_accuracy": 0.7941122502088547, "num_tokens": 47966300.0, "step": 783 }, { "entropy": 0.794921875, "epoch": 2.8223624887285843, "grad_norm": 0.18886737525463104, "learning_rate": 0.00021834532374100723, "loss": 0.7474, "mean_token_accuracy": 0.7873400747776031, "num_tokens": 48028670.0, "step": 784 }, { "entropy": 0.7783203125, "epoch": 2.8259693417493237, "grad_norm": 0.1982373595237732, "learning_rate": 0.00021798561151079138, "loss": 0.7395, "mean_token_accuracy": 0.7903161942958832, "num_tokens": 48089954.0, "step": 785 }, { "entropy": 0.79296875, "epoch": 2.829576194770063, "grad_norm": 0.19699576497077942, "learning_rate": 0.00021762589928057556, "loss": 0.7436, "mean_token_accuracy": 0.7893838733434677, "num_tokens": 48151039.0, "step": 786 }, { "entropy": 0.798828125, "epoch": 2.8331830477908024, "grad_norm": 0.1849612444639206, "learning_rate": 0.00021726618705035972, "loss": 0.736, "mean_token_accuracy": 0.790994182229042, "num_tokens": 48214011.0, "step": 787 }, { "entropy": 0.796875, "epoch": 2.836789900811542, "grad_norm": 0.18821904063224792, "learning_rate": 0.0002169064748201439, "loss": 0.7283, "mean_token_accuracy": 0.7936753928661346, "num_tokens": 48275781.0, "step": 788 }, { "entropy": 0.79296875, "epoch": 2.840396753832281, "grad_norm": 0.19064773619174957, "learning_rate": 0.00021654676258992805, "loss": 0.7243, "mean_token_accuracy": 0.7963396310806274, "num_tokens": 48337650.0, "step": 789 }, { "entropy": 0.7900390625, "epoch": 2.844003606853021, "grad_norm": 0.18892619013786316, "learning_rate": 0.00021618705035971223, "loss": 0.7304, "mean_token_accuracy": 0.7916672825813293, "num_tokens": 48397913.0, "step": 790 }, { "entropy": 0.7802734375, "epoch": 2.84761045987376, "grad_norm": 0.19981788098812103, "learning_rate": 0.00021582733812949639, "loss": 0.7097, "mean_token_accuracy": 0.7970959097146988, "num_tokens": 48460371.0, "step": 791 }, { "entropy": 0.7783203125, "epoch": 2.8512173128944998, "grad_norm": 0.18836061656475067, "learning_rate": 0.00021546762589928057, "loss": 0.7153, "mean_token_accuracy": 0.7969343364238739, "num_tokens": 48519196.0, "step": 792 }, { "entropy": 0.7685546875, "epoch": 2.8548241659152387, "grad_norm": 0.18480141460895538, "learning_rate": 0.00021510791366906478, "loss": 0.705, "mean_token_accuracy": 0.7994607836008072, "num_tokens": 48580853.0, "step": 793 }, { "entropy": 0.7783203125, "epoch": 2.8584310189359785, "grad_norm": 0.21744844317436218, "learning_rate": 0.00021474820143884893, "loss": 0.73, "mean_token_accuracy": 0.7906037271022797, "num_tokens": 48640378.0, "step": 794 }, { "entropy": 0.7626953125, "epoch": 2.862037871956718, "grad_norm": 0.19358819723129272, "learning_rate": 0.0002143884892086331, "loss": 0.7072, "mean_token_accuracy": 0.797954872250557, "num_tokens": 48702555.0, "step": 795 }, { "entropy": 0.7607421875, "epoch": 2.8656447249774573, "grad_norm": 0.19181716442108154, "learning_rate": 0.00021402877697841727, "loss": 0.7068, "mean_token_accuracy": 0.8002966791391373, "num_tokens": 48764449.0, "step": 796 }, { "entropy": 0.7646484375, "epoch": 2.8692515779981966, "grad_norm": 0.18241184949874878, "learning_rate": 0.00021366906474820145, "loss": 0.6934, "mean_token_accuracy": 0.8022985756397247, "num_tokens": 48827728.0, "step": 797 }, { "entropy": 0.8056640625, "epoch": 2.872858431018936, "grad_norm": 0.188787579536438, "learning_rate": 0.0002133093525179856, "loss": 0.749, "mean_token_accuracy": 0.7894743382930756, "num_tokens": 48889316.0, "step": 798 }, { "entropy": 0.794921875, "epoch": 2.8764652840396754, "grad_norm": 0.1910216212272644, "learning_rate": 0.00021294964028776978, "loss": 0.7326, "mean_token_accuracy": 0.7905001640319824, "num_tokens": 48949006.0, "step": 799 }, { "entropy": 0.78515625, "epoch": 2.8800721370604148, "grad_norm": 0.19157126545906067, "learning_rate": 0.00021258992805755394, "loss": 0.7304, "mean_token_accuracy": 0.7912836074829102, "num_tokens": 49009320.0, "step": 800 }, { "entropy": 0.77734375, "epoch": 2.883678990081154, "grad_norm": 0.19860349595546722, "learning_rate": 0.00021223021582733814, "loss": 0.7138, "mean_token_accuracy": 0.7988200634717941, "num_tokens": 49070770.0, "step": 801 }, { "entropy": 0.7705078125, "epoch": 2.8872858431018935, "grad_norm": 0.18240632116794586, "learning_rate": 0.00021187050359712233, "loss": 0.7132, "mean_token_accuracy": 0.7971154153347015, "num_tokens": 49131922.0, "step": 802 }, { "entropy": 0.767578125, "epoch": 2.890892696122633, "grad_norm": 0.1990460306406021, "learning_rate": 0.00021151079136690648, "loss": 0.7187, "mean_token_accuracy": 0.7961603403091431, "num_tokens": 49193396.0, "step": 803 }, { "entropy": 0.7548828125, "epoch": 2.8944995491433723, "grad_norm": 0.19559602439403534, "learning_rate": 0.00021115107913669066, "loss": 0.6971, "mean_token_accuracy": 0.7984348833560944, "num_tokens": 49255134.0, "step": 804 }, { "entropy": 0.7626953125, "epoch": 2.898106402164112, "grad_norm": 0.19437150657176971, "learning_rate": 0.00021079136690647482, "loss": 0.7108, "mean_token_accuracy": 0.7980929762125015, "num_tokens": 49317398.0, "step": 805 }, { "entropy": 0.7802734375, "epoch": 2.901713255184851, "grad_norm": 0.18890510499477386, "learning_rate": 0.000210431654676259, "loss": 0.722, "mean_token_accuracy": 0.7957810163497925, "num_tokens": 49380555.0, "step": 806 }, { "entropy": 0.7802734375, "epoch": 2.905320108205591, "grad_norm": 0.19144722819328308, "learning_rate": 0.00021007194244604315, "loss": 0.7222, "mean_token_accuracy": 0.793603852391243, "num_tokens": 49440798.0, "step": 807 }, { "entropy": 0.783203125, "epoch": 2.9089269612263298, "grad_norm": 0.18837811052799225, "learning_rate": 0.00020971223021582733, "loss": 0.7132, "mean_token_accuracy": 0.7962876409292221, "num_tokens": 49502465.0, "step": 808 }, { "entropy": 0.771484375, "epoch": 2.9125338142470696, "grad_norm": 0.1931532472372055, "learning_rate": 0.0002093525179856115, "loss": 0.7077, "mean_token_accuracy": 0.7997084259986877, "num_tokens": 49564250.0, "step": 809 }, { "entropy": 0.78515625, "epoch": 2.916140667267809, "grad_norm": 0.1931878924369812, "learning_rate": 0.0002089928057553957, "loss": 0.7305, "mean_token_accuracy": 0.7930234670639038, "num_tokens": 49626197.0, "step": 810 }, { "entropy": 0.765625, "epoch": 2.9197475202885483, "grad_norm": 0.18990421295166016, "learning_rate": 0.00020863309352517988, "loss": 0.7096, "mean_token_accuracy": 0.8005110770463943, "num_tokens": 49688742.0, "step": 811 }, { "entropy": 0.7685546875, "epoch": 2.9233543733092877, "grad_norm": 0.19004672765731812, "learning_rate": 0.00020827338129496403, "loss": 0.7145, "mean_token_accuracy": 0.7978890538215637, "num_tokens": 49749918.0, "step": 812 }, { "entropy": 0.7666015625, "epoch": 2.926961226330027, "grad_norm": 0.19058051705360413, "learning_rate": 0.0002079136690647482, "loss": 0.7082, "mean_token_accuracy": 0.7985552847385406, "num_tokens": 49811842.0, "step": 813 }, { "entropy": 0.78515625, "epoch": 2.9305680793507665, "grad_norm": 0.20216943323612213, "learning_rate": 0.00020755395683453237, "loss": 0.7454, "mean_token_accuracy": 0.7899932414293289, "num_tokens": 49873867.0, "step": 814 }, { "entropy": 0.7666015625, "epoch": 2.934174932371506, "grad_norm": 0.1897672861814499, "learning_rate": 0.00020719424460431655, "loss": 0.6976, "mean_token_accuracy": 0.8013168126344681, "num_tokens": 49934526.0, "step": 815 }, { "entropy": 0.78125, "epoch": 2.937781785392245, "grad_norm": 0.19361715018749237, "learning_rate": 0.00020683453237410073, "loss": 0.7136, "mean_token_accuracy": 0.7976363748311996, "num_tokens": 49995987.0, "step": 816 }, { "entropy": 0.7822265625, "epoch": 2.9413886384129846, "grad_norm": 0.18298502266407013, "learning_rate": 0.00020647482014388488, "loss": 0.7275, "mean_token_accuracy": 0.7932883650064468, "num_tokens": 50059031.0, "step": 817 }, { "entropy": 0.791015625, "epoch": 2.944995491433724, "grad_norm": 0.19138944149017334, "learning_rate": 0.0002061151079136691, "loss": 0.7365, "mean_token_accuracy": 0.789290577173233, "num_tokens": 50122960.0, "step": 818 }, { "entropy": 0.7783203125, "epoch": 2.9486023444544633, "grad_norm": 0.18651758134365082, "learning_rate": 0.00020575539568345324, "loss": 0.7063, "mean_token_accuracy": 0.800441563129425, "num_tokens": 50184193.0, "step": 819 }, { "entropy": 0.7705078125, "epoch": 2.952209197475203, "grad_norm": 0.19058655202388763, "learning_rate": 0.00020539568345323743, "loss": 0.7079, "mean_token_accuracy": 0.8003310561180115, "num_tokens": 50244156.0, "step": 820 }, { "entropy": 0.77734375, "epoch": 2.955816050495942, "grad_norm": 0.19263453781604767, "learning_rate": 0.00020503597122302158, "loss": 0.7152, "mean_token_accuracy": 0.7970972210168839, "num_tokens": 50306155.0, "step": 821 }, { "entropy": 0.775390625, "epoch": 2.959422903516682, "grad_norm": 0.19568851590156555, "learning_rate": 0.00020467625899280576, "loss": 0.7167, "mean_token_accuracy": 0.7985122203826904, "num_tokens": 50367305.0, "step": 822 }, { "entropy": 0.751953125, "epoch": 2.963029756537421, "grad_norm": 0.1969912052154541, "learning_rate": 0.00020431654676258992, "loss": 0.7044, "mean_token_accuracy": 0.7998211234807968, "num_tokens": 50428744.0, "step": 823 }, { "entropy": 0.7783203125, "epoch": 2.9666366095581607, "grad_norm": 0.1954231858253479, "learning_rate": 0.0002039568345323741, "loss": 0.7181, "mean_token_accuracy": 0.7996009290218353, "num_tokens": 50490865.0, "step": 824 }, { "entropy": 0.7998046875, "epoch": 2.9702434625789, "grad_norm": 0.2009086310863495, "learning_rate": 0.00020359712230215828, "loss": 0.7578, "mean_token_accuracy": 0.7874416708946228, "num_tokens": 50551046.0, "step": 825 }, { "entropy": 0.7626953125, "epoch": 2.9738503155996394, "grad_norm": 0.19157607853412628, "learning_rate": 0.00020323741007194246, "loss": 0.7036, "mean_token_accuracy": 0.7999936938285828, "num_tokens": 50612551.0, "step": 826 }, { "entropy": 0.7841796875, "epoch": 2.977457168620379, "grad_norm": 0.18839552998542786, "learning_rate": 0.00020287769784172664, "loss": 0.7271, "mean_token_accuracy": 0.7943488508462906, "num_tokens": 50673743.0, "step": 827 }, { "entropy": 0.78515625, "epoch": 2.981064021641118, "grad_norm": 0.1896384358406067, "learning_rate": 0.0002025179856115108, "loss": 0.7259, "mean_token_accuracy": 0.7944885641336441, "num_tokens": 50735194.0, "step": 828 }, { "entropy": 0.7724609375, "epoch": 2.9846708746618575, "grad_norm": 0.18864727020263672, "learning_rate": 0.00020215827338129498, "loss": 0.7064, "mean_token_accuracy": 0.7992347925901413, "num_tokens": 50797401.0, "step": 829 }, { "entropy": 0.7802734375, "epoch": 2.988277727682597, "grad_norm": 0.18989479541778564, "learning_rate": 0.00020179856115107913, "loss": 0.7106, "mean_token_accuracy": 0.7961680889129639, "num_tokens": 50856130.0, "step": 830 }, { "entropy": 0.779296875, "epoch": 2.9918845807033363, "grad_norm": 0.186777263879776, "learning_rate": 0.0002014388489208633, "loss": 0.7127, "mean_token_accuracy": 0.796418160200119, "num_tokens": 50917491.0, "step": 831 }, { "entropy": 0.7900390625, "epoch": 2.9954914337240757, "grad_norm": 0.1882639080286026, "learning_rate": 0.0002010791366906475, "loss": 0.7155, "mean_token_accuracy": 0.7967613935470581, "num_tokens": 50978629.0, "step": 832 }, { "entropy": 0.7470703125, "epoch": 2.999098286744815, "grad_norm": 0.19276052713394165, "learning_rate": 0.00020071942446043165, "loss": 0.6776, "mean_token_accuracy": 0.8048471957445145, "num_tokens": 51041727.0, "step": 833 }, { "entropy": 0.7109375, "epoch": 3.0, "grad_norm": 0.35983923077583313, "learning_rate": 0.00020035971223021583, "loss": 0.6418, "mean_token_accuracy": 0.8159685134887695, "num_tokens": 51056723.0, "step": 834 }, { "epoch": 3.0, "eval_entropy": 0.87890625, "eval_loss": 1.0363644361495972, "eval_mean_token_accuracy": 0.7278320491313934, "eval_num_tokens": 51056723.0, "eval_runtime": 1.9836, "eval_samples_per_second": 25.207, "eval_steps_per_second": 1.008, "step": 834 } ], "logging_steps": 1.0, "max_steps": 1390, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5813410720658227e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }