{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.983818770226537, "eval_steps": 500, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006472491909385114, "grad_norm": 5.30616569519043, "learning_rate": 0.0, "loss": 1.1501, "num_tokens": 963559.0, "step": 1 }, { "epoch": 0.012944983818770227, "grad_norm": 5.308490753173828, "learning_rate": 1.0638297872340427e-06, "loss": 1.1455, "num_tokens": 1927760.0, "step": 2 }, { "epoch": 0.019417475728155338, "grad_norm": 5.182309150695801, "learning_rate": 2.1276595744680853e-06, "loss": 1.1344, "num_tokens": 2896369.0, "step": 3 }, { "epoch": 0.025889967637540454, "grad_norm": 4.847397804260254, "learning_rate": 3.1914893617021277e-06, "loss": 1.1109, "num_tokens": 3886583.0, "step": 4 }, { "epoch": 0.032362459546925564, "grad_norm": 4.195371627807617, "learning_rate": 4.255319148936171e-06, "loss": 1.0588, "num_tokens": 4864476.0, "step": 5 }, { "epoch": 0.038834951456310676, "grad_norm": 3.016223669052124, "learning_rate": 5.319148936170213e-06, "loss": 0.9651, "num_tokens": 5873295.0, "step": 6 }, { "epoch": 0.045307443365695796, "grad_norm": 1.8406875133514404, "learning_rate": 6.3829787234042555e-06, "loss": 0.8156, "num_tokens": 6876484.0, "step": 7 }, { "epoch": 0.05177993527508091, "grad_norm": 1.5402626991271973, "learning_rate": 7.446808510638298e-06, "loss": 0.7824, "num_tokens": 7871664.0, "step": 8 }, { "epoch": 0.05825242718446602, "grad_norm": 1.298897385597229, "learning_rate": 8.510638297872341e-06, "loss": 0.7578, "num_tokens": 8869931.0, "step": 9 }, { "epoch": 0.06472491909385113, "grad_norm": 1.8411380052566528, "learning_rate": 9.574468085106383e-06, "loss": 0.7185, "num_tokens": 9874687.0, "step": 10 }, { "epoch": 0.07119741100323625, "grad_norm": 1.200448989868164, "learning_rate": 1.0638297872340426e-05, "loss": 0.6871, "num_tokens": 10842628.0, "step": 11 }, { "epoch": 0.07766990291262135, "grad_norm": 0.9098120331764221, "learning_rate": 1.170212765957447e-05, "loss": 0.6798, "num_tokens": 11815438.0, "step": 12 }, { "epoch": 0.08414239482200647, "grad_norm": 0.7887447476387024, "learning_rate": 1.2765957446808511e-05, "loss": 0.6431, "num_tokens": 12788222.0, "step": 13 }, { "epoch": 0.09061488673139159, "grad_norm": 0.6693373322486877, "learning_rate": 1.3829787234042554e-05, "loss": 0.6288, "num_tokens": 13756270.0, "step": 14 }, { "epoch": 0.0970873786407767, "grad_norm": 0.4491701126098633, "learning_rate": 1.4893617021276596e-05, "loss": 0.6025, "num_tokens": 14689813.0, "step": 15 }, { "epoch": 0.10355987055016182, "grad_norm": 0.37262922525405884, "learning_rate": 1.595744680851064e-05, "loss": 0.592, "num_tokens": 15670657.0, "step": 16 }, { "epoch": 0.11003236245954692, "grad_norm": 0.3815455138683319, "learning_rate": 1.7021276595744682e-05, "loss": 0.5745, "num_tokens": 16640491.0, "step": 17 }, { "epoch": 0.11650485436893204, "grad_norm": 0.35885313153266907, "learning_rate": 1.8085106382978724e-05, "loss": 0.567, "num_tokens": 17611388.0, "step": 18 }, { "epoch": 0.12297734627831715, "grad_norm": 0.28868967294692993, "learning_rate": 1.9148936170212766e-05, "loss": 0.56, "num_tokens": 18582653.0, "step": 19 }, { "epoch": 0.12944983818770225, "grad_norm": 0.2990097105503082, "learning_rate": 2.0212765957446807e-05, "loss": 0.5475, "num_tokens": 19557459.0, "step": 20 }, { "epoch": 0.13592233009708737, "grad_norm": 0.24530692398548126, "learning_rate": 2.1276595744680852e-05, "loss": 0.546, "num_tokens": 20524158.0, "step": 21 }, { "epoch": 0.1423948220064725, "grad_norm": 0.26838523149490356, "learning_rate": 2.2340425531914894e-05, "loss": 0.5278, "num_tokens": 21502674.0, "step": 22 }, { "epoch": 0.1488673139158576, "grad_norm": 0.24428600072860718, "learning_rate": 2.340425531914894e-05, "loss": 0.5243, "num_tokens": 22446294.0, "step": 23 }, { "epoch": 0.1553398058252427, "grad_norm": 0.22131946682929993, "learning_rate": 2.446808510638298e-05, "loss": 0.5166, "num_tokens": 23430225.0, "step": 24 }, { "epoch": 0.16181229773462782, "grad_norm": 0.21418221294879913, "learning_rate": 2.5531914893617022e-05, "loss": 0.5105, "num_tokens": 24380419.0, "step": 25 }, { "epoch": 0.16828478964401294, "grad_norm": 0.22137139737606049, "learning_rate": 2.6595744680851064e-05, "loss": 0.5016, "num_tokens": 25343499.0, "step": 26 }, { "epoch": 0.17475728155339806, "grad_norm": 0.23214539885520935, "learning_rate": 2.765957446808511e-05, "loss": 0.4979, "num_tokens": 26315921.0, "step": 27 }, { "epoch": 0.18122977346278318, "grad_norm": 0.20767340064048767, "learning_rate": 2.8723404255319154e-05, "loss": 0.5015, "num_tokens": 27255686.0, "step": 28 }, { "epoch": 0.18770226537216828, "grad_norm": 0.24059996008872986, "learning_rate": 2.9787234042553192e-05, "loss": 0.4933, "num_tokens": 28232449.0, "step": 29 }, { "epoch": 0.1941747572815534, "grad_norm": 0.2784816324710846, "learning_rate": 3.085106382978723e-05, "loss": 0.4902, "num_tokens": 29213099.0, "step": 30 }, { "epoch": 0.20064724919093851, "grad_norm": 0.26755547523498535, "learning_rate": 3.191489361702128e-05, "loss": 0.4888, "num_tokens": 30210546.0, "step": 31 }, { "epoch": 0.20711974110032363, "grad_norm": 0.23981450498104095, "learning_rate": 3.2978723404255317e-05, "loss": 0.476, "num_tokens": 31190215.0, "step": 32 }, { "epoch": 0.21359223300970873, "grad_norm": 0.21546247601509094, "learning_rate": 3.4042553191489365e-05, "loss": 0.4723, "num_tokens": 32152368.0, "step": 33 }, { "epoch": 0.22006472491909385, "grad_norm": 0.2640170156955719, "learning_rate": 3.5106382978723407e-05, "loss": 0.4811, "num_tokens": 33138410.0, "step": 34 }, { "epoch": 0.22653721682847897, "grad_norm": 0.2773108184337616, "learning_rate": 3.617021276595745e-05, "loss": 0.4732, "num_tokens": 34103559.0, "step": 35 }, { "epoch": 0.23300970873786409, "grad_norm": 0.30199486017227173, "learning_rate": 3.723404255319149e-05, "loss": 0.4751, "num_tokens": 35099702.0, "step": 36 }, { "epoch": 0.23948220064724918, "grad_norm": 0.3108445107936859, "learning_rate": 3.829787234042553e-05, "loss": 0.4752, "num_tokens": 36067246.0, "step": 37 }, { "epoch": 0.2459546925566343, "grad_norm": 0.20293442904949188, "learning_rate": 3.936170212765958e-05, "loss": 0.4629, "num_tokens": 37074157.0, "step": 38 }, { "epoch": 0.2524271844660194, "grad_norm": 0.28514423966407776, "learning_rate": 4.0425531914893614e-05, "loss": 0.461, "num_tokens": 38026864.0, "step": 39 }, { "epoch": 0.2588996763754045, "grad_norm": 0.2532414197921753, "learning_rate": 4.148936170212766e-05, "loss": 0.4559, "num_tokens": 39007950.0, "step": 40 }, { "epoch": 0.26537216828478966, "grad_norm": 0.2653239965438843, "learning_rate": 4.2553191489361704e-05, "loss": 0.4636, "num_tokens": 40004404.0, "step": 41 }, { "epoch": 0.27184466019417475, "grad_norm": 0.26811864972114563, "learning_rate": 4.3617021276595746e-05, "loss": 0.4582, "num_tokens": 40979367.0, "step": 42 }, { "epoch": 0.2783171521035599, "grad_norm": 0.21482442319393158, "learning_rate": 4.468085106382979e-05, "loss": 0.4555, "num_tokens": 41954716.0, "step": 43 }, { "epoch": 0.284789644012945, "grad_norm": 0.3701953589916229, "learning_rate": 4.574468085106383e-05, "loss": 0.4645, "num_tokens": 42927994.0, "step": 44 }, { "epoch": 0.2912621359223301, "grad_norm": 0.21789038181304932, "learning_rate": 4.680851063829788e-05, "loss": 0.4517, "num_tokens": 43870458.0, "step": 45 }, { "epoch": 0.2977346278317152, "grad_norm": 0.265918493270874, "learning_rate": 4.787234042553192e-05, "loss": 0.4523, "num_tokens": 44845205.0, "step": 46 }, { "epoch": 0.3042071197411003, "grad_norm": 0.3400862216949463, "learning_rate": 4.893617021276596e-05, "loss": 0.4504, "num_tokens": 45817785.0, "step": 47 }, { "epoch": 0.3106796116504854, "grad_norm": 0.22314198315143585, "learning_rate": 5e-05, "loss": 0.4528, "num_tokens": 46768552.0, "step": 48 }, { "epoch": 0.31715210355987056, "grad_norm": 0.2896440625190735, "learning_rate": 5e-05, "loss": 0.4382, "num_tokens": 47720390.0, "step": 49 }, { "epoch": 0.32362459546925565, "grad_norm": 0.24954937398433685, "learning_rate": 5e-05, "loss": 0.4478, "num_tokens": 48659119.0, "step": 50 }, { "epoch": 0.3300970873786408, "grad_norm": 0.2640365958213806, "learning_rate": 5e-05, "loss": 0.4471, "num_tokens": 49635896.0, "step": 51 }, { "epoch": 0.3365695792880259, "grad_norm": 0.2811153829097748, "learning_rate": 5e-05, "loss": 0.4468, "num_tokens": 50617481.0, "step": 52 }, { "epoch": 0.343042071197411, "grad_norm": 0.3876810669898987, "learning_rate": 5e-05, "loss": 0.4439, "num_tokens": 51580322.0, "step": 53 }, { "epoch": 0.34951456310679613, "grad_norm": 0.317444771528244, "learning_rate": 5e-05, "loss": 0.4463, "num_tokens": 52544983.0, "step": 54 }, { "epoch": 0.3559870550161812, "grad_norm": 0.27956002950668335, "learning_rate": 5e-05, "loss": 0.4377, "num_tokens": 53538431.0, "step": 55 }, { "epoch": 0.36245954692556637, "grad_norm": 0.27019619941711426, "learning_rate": 5e-05, "loss": 0.4386, "num_tokens": 54497097.0, "step": 56 }, { "epoch": 0.36893203883495146, "grad_norm": 0.2730930745601654, "learning_rate": 5e-05, "loss": 0.4288, "num_tokens": 55515695.0, "step": 57 }, { "epoch": 0.37540453074433655, "grad_norm": 0.35878482460975647, "learning_rate": 5e-05, "loss": 0.4322, "num_tokens": 56471100.0, "step": 58 }, { "epoch": 0.3818770226537217, "grad_norm": 0.2792416214942932, "learning_rate": 5e-05, "loss": 0.4364, "num_tokens": 57460948.0, "step": 59 }, { "epoch": 0.3883495145631068, "grad_norm": 0.3022445738315582, "learning_rate": 5e-05, "loss": 0.4365, "num_tokens": 58450663.0, "step": 60 }, { "epoch": 0.3948220064724919, "grad_norm": 0.23577195405960083, "learning_rate": 5e-05, "loss": 0.4292, "num_tokens": 59407638.0, "step": 61 }, { "epoch": 0.40129449838187703, "grad_norm": 0.25552427768707275, "learning_rate": 5e-05, "loss": 0.4247, "num_tokens": 60358388.0, "step": 62 }, { "epoch": 0.4077669902912621, "grad_norm": 0.21623407304286957, "learning_rate": 5e-05, "loss": 0.4207, "num_tokens": 61306137.0, "step": 63 }, { "epoch": 0.41423948220064727, "grad_norm": 0.2959029972553253, "learning_rate": 5e-05, "loss": 0.4331, "num_tokens": 62298616.0, "step": 64 }, { "epoch": 0.42071197411003236, "grad_norm": 0.3045353591442108, "learning_rate": 5e-05, "loss": 0.4237, "num_tokens": 63242620.0, "step": 65 }, { "epoch": 0.42718446601941745, "grad_norm": 0.1943003237247467, "learning_rate": 5e-05, "loss": 0.4375, "num_tokens": 64231555.0, "step": 66 }, { "epoch": 0.4336569579288026, "grad_norm": 0.23736479878425598, "learning_rate": 5e-05, "loss": 0.4267, "num_tokens": 65190302.0, "step": 67 }, { "epoch": 0.4401294498381877, "grad_norm": 0.23170194029808044, "learning_rate": 5e-05, "loss": 0.4264, "num_tokens": 66175194.0, "step": 68 }, { "epoch": 0.44660194174757284, "grad_norm": 0.2706851661205292, "learning_rate": 5e-05, "loss": 0.4229, "num_tokens": 67146659.0, "step": 69 }, { "epoch": 0.45307443365695793, "grad_norm": 0.30120500922203064, "learning_rate": 5e-05, "loss": 0.4088, "num_tokens": 68115421.0, "step": 70 }, { "epoch": 0.459546925566343, "grad_norm": 0.2372325211763382, "learning_rate": 5e-05, "loss": 0.4069, "num_tokens": 69078962.0, "step": 71 }, { "epoch": 0.46601941747572817, "grad_norm": 0.2547263205051422, "learning_rate": 5e-05, "loss": 0.4312, "num_tokens": 70056552.0, "step": 72 }, { "epoch": 0.47249190938511326, "grad_norm": 0.24434395134449005, "learning_rate": 5e-05, "loss": 0.4243, "num_tokens": 71056405.0, "step": 73 }, { "epoch": 0.47896440129449835, "grad_norm": 0.2492789626121521, "learning_rate": 5e-05, "loss": 0.4344, "num_tokens": 72011963.0, "step": 74 }, { "epoch": 0.4854368932038835, "grad_norm": 0.222487673163414, "learning_rate": 5e-05, "loss": 0.4208, "num_tokens": 72974088.0, "step": 75 }, { "epoch": 0.4919093851132686, "grad_norm": 0.23287588357925415, "learning_rate": 5e-05, "loss": 0.4169, "num_tokens": 73963066.0, "step": 76 }, { "epoch": 0.49838187702265374, "grad_norm": 0.2648158669471741, "learning_rate": 5e-05, "loss": 0.4112, "num_tokens": 74927507.0, "step": 77 }, { "epoch": 0.5048543689320388, "grad_norm": 0.29371848702430725, "learning_rate": 5e-05, "loss": 0.4161, "num_tokens": 75913302.0, "step": 78 }, { "epoch": 0.511326860841424, "grad_norm": 0.2672727108001709, "learning_rate": 5e-05, "loss": 0.425, "num_tokens": 76871637.0, "step": 79 }, { "epoch": 0.517799352750809, "grad_norm": 0.19733232259750366, "learning_rate": 5e-05, "loss": 0.4177, "num_tokens": 77844738.0, "step": 80 }, { "epoch": 0.5242718446601942, "grad_norm": 0.239099383354187, "learning_rate": 5e-05, "loss": 0.4116, "num_tokens": 78789735.0, "step": 81 }, { "epoch": 0.5307443365695793, "grad_norm": 0.2799425721168518, "learning_rate": 5e-05, "loss": 0.4094, "num_tokens": 79740057.0, "step": 82 }, { "epoch": 0.5372168284789643, "grad_norm": 0.19828742742538452, "learning_rate": 5e-05, "loss": 0.4156, "num_tokens": 80691344.0, "step": 83 }, { "epoch": 0.5436893203883495, "grad_norm": 0.2844972014427185, "learning_rate": 5e-05, "loss": 0.417, "num_tokens": 81695065.0, "step": 84 }, { "epoch": 0.5501618122977346, "grad_norm": 0.23156537115573883, "learning_rate": 5e-05, "loss": 0.4135, "num_tokens": 82670591.0, "step": 85 }, { "epoch": 0.5566343042071198, "grad_norm": 0.22037866711616516, "learning_rate": 5e-05, "loss": 0.4127, "num_tokens": 83619525.0, "step": 86 }, { "epoch": 0.5631067961165048, "grad_norm": 0.26530957221984863, "learning_rate": 5e-05, "loss": 0.413, "num_tokens": 84583829.0, "step": 87 }, { "epoch": 0.56957928802589, "grad_norm": 0.23168902099132538, "learning_rate": 5e-05, "loss": 0.4129, "num_tokens": 85554557.0, "step": 88 }, { "epoch": 0.5760517799352751, "grad_norm": 0.28108587861061096, "learning_rate": 5e-05, "loss": 0.4151, "num_tokens": 86534800.0, "step": 89 }, { "epoch": 0.5825242718446602, "grad_norm": 0.2914122939109802, "learning_rate": 5e-05, "loss": 0.4186, "num_tokens": 87479322.0, "step": 90 }, { "epoch": 0.5889967637540453, "grad_norm": 0.261422723531723, "learning_rate": 5e-05, "loss": 0.4159, "num_tokens": 88440604.0, "step": 91 }, { "epoch": 0.5954692556634305, "grad_norm": 0.22620953619480133, "learning_rate": 5e-05, "loss": 0.413, "num_tokens": 89440902.0, "step": 92 }, { "epoch": 0.6019417475728155, "grad_norm": 0.2577684223651886, "learning_rate": 5e-05, "loss": 0.4063, "num_tokens": 90420292.0, "step": 93 }, { "epoch": 0.6084142394822006, "grad_norm": 0.24958066642284393, "learning_rate": 5e-05, "loss": 0.4132, "num_tokens": 91365927.0, "step": 94 }, { "epoch": 0.6148867313915858, "grad_norm": 0.19927746057510376, "learning_rate": 5e-05, "loss": 0.4086, "num_tokens": 92339481.0, "step": 95 }, { "epoch": 0.6213592233009708, "grad_norm": 0.20518355071544647, "learning_rate": 5e-05, "loss": 0.4034, "num_tokens": 93277920.0, "step": 96 }, { "epoch": 0.627831715210356, "grad_norm": 0.2711578607559204, "learning_rate": 5e-05, "loss": 0.4035, "num_tokens": 94233058.0, "step": 97 }, { "epoch": 0.6343042071197411, "grad_norm": 0.2453780472278595, "learning_rate": 5e-05, "loss": 0.4166, "num_tokens": 95195865.0, "step": 98 }, { "epoch": 0.6407766990291263, "grad_norm": 0.2487688809633255, "learning_rate": 5e-05, "loss": 0.3997, "num_tokens": 96165533.0, "step": 99 }, { "epoch": 0.6472491909385113, "grad_norm": 0.2185203731060028, "learning_rate": 5e-05, "loss": 0.4015, "num_tokens": 97128908.0, "step": 100 }, { "epoch": 0.6537216828478964, "grad_norm": 0.27545610070228577, "learning_rate": 5e-05, "loss": 0.4038, "num_tokens": 98137812.0, "step": 101 }, { "epoch": 0.6601941747572816, "grad_norm": 0.2493947595357895, "learning_rate": 5e-05, "loss": 0.4001, "num_tokens": 99133445.0, "step": 102 }, { "epoch": 0.6666666666666666, "grad_norm": 0.23615600168704987, "learning_rate": 5e-05, "loss": 0.3929, "num_tokens": 100106034.0, "step": 103 }, { "epoch": 0.6731391585760518, "grad_norm": 0.20948085188865662, "learning_rate": 5e-05, "loss": 0.4192, "num_tokens": 101060830.0, "step": 104 }, { "epoch": 0.6796116504854369, "grad_norm": 0.23341938853263855, "learning_rate": 5e-05, "loss": 0.4038, "num_tokens": 102032708.0, "step": 105 }, { "epoch": 0.686084142394822, "grad_norm": 0.2857361435890198, "learning_rate": 5e-05, "loss": 0.3886, "num_tokens": 103020888.0, "step": 106 }, { "epoch": 0.6925566343042071, "grad_norm": 0.22391542792320251, "learning_rate": 5e-05, "loss": 0.4105, "num_tokens": 103986883.0, "step": 107 }, { "epoch": 0.6990291262135923, "grad_norm": 0.2189360409975052, "learning_rate": 5e-05, "loss": 0.4075, "num_tokens": 104978514.0, "step": 108 }, { "epoch": 0.7055016181229773, "grad_norm": 0.2077621966600418, "learning_rate": 5e-05, "loss": 0.3949, "num_tokens": 105938542.0, "step": 109 }, { "epoch": 0.7119741100323624, "grad_norm": 0.2519223690032959, "learning_rate": 5e-05, "loss": 0.3981, "num_tokens": 106909993.0, "step": 110 }, { "epoch": 0.7184466019417476, "grad_norm": 0.2587706744670868, "learning_rate": 5e-05, "loss": 0.4019, "num_tokens": 107872313.0, "step": 111 }, { "epoch": 0.7249190938511327, "grad_norm": 0.20526064932346344, "learning_rate": 5e-05, "loss": 0.3955, "num_tokens": 108876000.0, "step": 112 }, { "epoch": 0.7313915857605178, "grad_norm": 0.2061409205198288, "learning_rate": 5e-05, "loss": 0.3957, "num_tokens": 109857790.0, "step": 113 }, { "epoch": 0.7378640776699029, "grad_norm": 0.2351228892803192, "learning_rate": 5e-05, "loss": 0.397, "num_tokens": 110826497.0, "step": 114 }, { "epoch": 0.7443365695792881, "grad_norm": 0.2670630216598511, "learning_rate": 5e-05, "loss": 0.3968, "num_tokens": 111775780.0, "step": 115 }, { "epoch": 0.7508090614886731, "grad_norm": 0.2395128458738327, "learning_rate": 5e-05, "loss": 0.3983, "num_tokens": 112755203.0, "step": 116 }, { "epoch": 0.7572815533980582, "grad_norm": 0.21340428292751312, "learning_rate": 5e-05, "loss": 0.3979, "num_tokens": 113720983.0, "step": 117 }, { "epoch": 0.7637540453074434, "grad_norm": 0.2701132893562317, "learning_rate": 5e-05, "loss": 0.3899, "num_tokens": 114690500.0, "step": 118 }, { "epoch": 0.7702265372168284, "grad_norm": 0.21103884279727936, "learning_rate": 5e-05, "loss": 0.3897, "num_tokens": 115663752.0, "step": 119 }, { "epoch": 0.7766990291262136, "grad_norm": 0.2516920864582062, "learning_rate": 5e-05, "loss": 0.3879, "num_tokens": 116656118.0, "step": 120 }, { "epoch": 0.7831715210355987, "grad_norm": 0.2821183204650879, "learning_rate": 5e-05, "loss": 0.3975, "num_tokens": 117647006.0, "step": 121 }, { "epoch": 0.7896440129449838, "grad_norm": 0.23541994392871857, "learning_rate": 5e-05, "loss": 0.3883, "num_tokens": 118637898.0, "step": 122 }, { "epoch": 0.7961165048543689, "grad_norm": 0.2274329662322998, "learning_rate": 5e-05, "loss": 0.3966, "num_tokens": 119611539.0, "step": 123 }, { "epoch": 0.8025889967637541, "grad_norm": 0.2166299968957901, "learning_rate": 5e-05, "loss": 0.3977, "num_tokens": 120576485.0, "step": 124 }, { "epoch": 0.8090614886731392, "grad_norm": 0.2949488162994385, "learning_rate": 5e-05, "loss": 0.4001, "num_tokens": 121567862.0, "step": 125 }, { "epoch": 0.8155339805825242, "grad_norm": 0.20616242289543152, "learning_rate": 5e-05, "loss": 0.3904, "num_tokens": 122530893.0, "step": 126 }, { "epoch": 0.8220064724919094, "grad_norm": 0.2675597667694092, "learning_rate": 5e-05, "loss": 0.3915, "num_tokens": 123480227.0, "step": 127 }, { "epoch": 0.8284789644012945, "grad_norm": 0.2509215772151947, "learning_rate": 5e-05, "loss": 0.4052, "num_tokens": 124462070.0, "step": 128 }, { "epoch": 0.8349514563106796, "grad_norm": 0.22263118624687195, "learning_rate": 5e-05, "loss": 0.3988, "num_tokens": 125440428.0, "step": 129 }, { "epoch": 0.8414239482200647, "grad_norm": 0.2704783082008362, "learning_rate": 5e-05, "loss": 0.395, "num_tokens": 126416310.0, "step": 130 }, { "epoch": 0.8478964401294499, "grad_norm": 0.230453222990036, "learning_rate": 5e-05, "loss": 0.3939, "num_tokens": 127406436.0, "step": 131 }, { "epoch": 0.8543689320388349, "grad_norm": 0.21275080740451813, "learning_rate": 5e-05, "loss": 0.3926, "num_tokens": 128372197.0, "step": 132 }, { "epoch": 0.86084142394822, "grad_norm": 0.20798979699611664, "learning_rate": 5e-05, "loss": 0.3893, "num_tokens": 129371264.0, "step": 133 }, { "epoch": 0.8673139158576052, "grad_norm": 0.25022491812705994, "learning_rate": 5e-05, "loss": 0.3772, "num_tokens": 130306080.0, "step": 134 }, { "epoch": 0.8737864077669902, "grad_norm": 0.19588612020015717, "learning_rate": 5e-05, "loss": 0.3956, "num_tokens": 131261937.0, "step": 135 }, { "epoch": 0.8802588996763754, "grad_norm": 0.2540820837020874, "learning_rate": 5e-05, "loss": 0.3894, "num_tokens": 132223182.0, "step": 136 }, { "epoch": 0.8867313915857605, "grad_norm": 0.20858538150787354, "learning_rate": 5e-05, "loss": 0.3927, "num_tokens": 133211562.0, "step": 137 }, { "epoch": 0.8932038834951457, "grad_norm": 0.2451946884393692, "learning_rate": 5e-05, "loss": 0.3907, "num_tokens": 134208133.0, "step": 138 }, { "epoch": 0.8996763754045307, "grad_norm": 0.2214939147233963, "learning_rate": 5e-05, "loss": 0.393, "num_tokens": 135194807.0, "step": 139 }, { "epoch": 0.9061488673139159, "grad_norm": 0.18540489673614502, "learning_rate": 5e-05, "loss": 0.3913, "num_tokens": 136175941.0, "step": 140 }, { "epoch": 0.912621359223301, "grad_norm": 0.2511117458343506, "learning_rate": 5e-05, "loss": 0.4022, "num_tokens": 137117638.0, "step": 141 }, { "epoch": 0.919093851132686, "grad_norm": 0.17371752858161926, "learning_rate": 5e-05, "loss": 0.3808, "num_tokens": 138102412.0, "step": 142 }, { "epoch": 0.9255663430420712, "grad_norm": 0.20003624260425568, "learning_rate": 5e-05, "loss": 0.3833, "num_tokens": 139056304.0, "step": 143 }, { "epoch": 0.9320388349514563, "grad_norm": 0.24626004695892334, "learning_rate": 5e-05, "loss": 0.3894, "num_tokens": 140002988.0, "step": 144 }, { "epoch": 0.9385113268608414, "grad_norm": 0.23707589507102966, "learning_rate": 5e-05, "loss": 0.3924, "num_tokens": 140939323.0, "step": 145 }, { "epoch": 0.9449838187702265, "grad_norm": 0.23435530066490173, "learning_rate": 5e-05, "loss": 0.3866, "num_tokens": 141898246.0, "step": 146 }, { "epoch": 0.9514563106796117, "grad_norm": 0.3063051104545593, "learning_rate": 5e-05, "loss": 0.3951, "num_tokens": 142861118.0, "step": 147 }, { "epoch": 0.9579288025889967, "grad_norm": 0.18616275489330292, "learning_rate": 5e-05, "loss": 0.3882, "num_tokens": 143845628.0, "step": 148 }, { "epoch": 0.9644012944983819, "grad_norm": 0.21778912842273712, "learning_rate": 5e-05, "loss": 0.3987, "num_tokens": 144816454.0, "step": 149 }, { "epoch": 0.970873786407767, "grad_norm": 0.27881455421447754, "learning_rate": 5e-05, "loss": 0.389, "num_tokens": 145791528.0, "step": 150 }, { "epoch": 0.9773462783171522, "grad_norm": 0.1892063021659851, "learning_rate": 5e-05, "loss": 0.4004, "num_tokens": 146776225.0, "step": 151 }, { "epoch": 0.9838187702265372, "grad_norm": 0.2594713866710663, "learning_rate": 5e-05, "loss": 0.3845, "num_tokens": 147716038.0, "step": 152 }, { "epoch": 0.9902912621359223, "grad_norm": 0.18184727430343628, "learning_rate": 5e-05, "loss": 0.3804, "num_tokens": 148641036.0, "step": 153 }, { "epoch": 0.9967637540453075, "grad_norm": 0.21150894463062286, "learning_rate": 5e-05, "loss": 0.3844, "num_tokens": 149611850.0, "step": 154 }, { "epoch": 1.0, "grad_norm": 0.21150894463062286, "learning_rate": 5e-05, "loss": 0.386, "num_tokens": 150082857.0, "step": 155 }, { "epoch": 1.006472491909385, "grad_norm": 0.3808545768260956, "learning_rate": 5e-05, "loss": 0.3396, "num_tokens": 151065207.0, "step": 156 }, { "epoch": 1.0129449838187703, "grad_norm": 0.2388165444135666, "learning_rate": 5e-05, "loss": 0.3318, "num_tokens": 152046682.0, "step": 157 }, { "epoch": 1.0194174757281553, "grad_norm": 0.2872416377067566, "learning_rate": 5e-05, "loss": 0.3343, "num_tokens": 153014543.0, "step": 158 }, { "epoch": 1.0258899676375404, "grad_norm": 0.2236020863056183, "learning_rate": 5e-05, "loss": 0.3372, "num_tokens": 153988434.0, "step": 159 }, { "epoch": 1.0323624595469256, "grad_norm": 0.2157917320728302, "learning_rate": 5e-05, "loss": 0.3376, "num_tokens": 154938356.0, "step": 160 }, { "epoch": 1.0388349514563107, "grad_norm": 0.23544207215309143, "learning_rate": 5e-05, "loss": 0.3369, "num_tokens": 155920665.0, "step": 161 }, { "epoch": 1.0453074433656957, "grad_norm": 0.247762992978096, "learning_rate": 5e-05, "loss": 0.3366, "num_tokens": 156878844.0, "step": 162 }, { "epoch": 1.051779935275081, "grad_norm": 0.25991392135620117, "learning_rate": 5e-05, "loss": 0.3307, "num_tokens": 157852056.0, "step": 163 }, { "epoch": 1.058252427184466, "grad_norm": 0.20868179202079773, "learning_rate": 5e-05, "loss": 0.337, "num_tokens": 158847308.0, "step": 164 }, { "epoch": 1.064724919093851, "grad_norm": 0.20805694162845612, "learning_rate": 5e-05, "loss": 0.339, "num_tokens": 159815676.0, "step": 165 }, { "epoch": 1.0711974110032363, "grad_norm": 0.21457871794700623, "learning_rate": 5e-05, "loss": 0.3359, "num_tokens": 160783247.0, "step": 166 }, { "epoch": 1.0776699029126213, "grad_norm": 0.26445016264915466, "learning_rate": 5e-05, "loss": 0.3315, "num_tokens": 161757978.0, "step": 167 }, { "epoch": 1.0841423948220066, "grad_norm": 0.2563857436180115, "learning_rate": 5e-05, "loss": 0.3291, "num_tokens": 162697927.0, "step": 168 }, { "epoch": 1.0906148867313916, "grad_norm": 0.25436389446258545, "learning_rate": 5e-05, "loss": 0.3358, "num_tokens": 163662092.0, "step": 169 }, { "epoch": 1.0970873786407767, "grad_norm": 0.20826613903045654, "learning_rate": 5e-05, "loss": 0.3277, "num_tokens": 164636005.0, "step": 170 }, { "epoch": 1.103559870550162, "grad_norm": 0.19969569146633148, "learning_rate": 5e-05, "loss": 0.3331, "num_tokens": 165575709.0, "step": 171 }, { "epoch": 1.110032362459547, "grad_norm": 0.19038818776607513, "learning_rate": 5e-05, "loss": 0.3211, "num_tokens": 166581172.0, "step": 172 }, { "epoch": 1.116504854368932, "grad_norm": 0.1880563646554947, "learning_rate": 5e-05, "loss": 0.3313, "num_tokens": 167529454.0, "step": 173 }, { "epoch": 1.1229773462783172, "grad_norm": 0.20041118562221527, "learning_rate": 5e-05, "loss": 0.3234, "num_tokens": 168484414.0, "step": 174 }, { "epoch": 1.1294498381877023, "grad_norm": 0.23865336179733276, "learning_rate": 5e-05, "loss": 0.3349, "num_tokens": 169452401.0, "step": 175 }, { "epoch": 1.1359223300970873, "grad_norm": 0.23407898843288422, "learning_rate": 5e-05, "loss": 0.3273, "num_tokens": 170407712.0, "step": 176 }, { "epoch": 1.1423948220064726, "grad_norm": 0.20350106060504913, "learning_rate": 5e-05, "loss": 0.3359, "num_tokens": 171418075.0, "step": 177 }, { "epoch": 1.1488673139158576, "grad_norm": 0.2573348581790924, "learning_rate": 5e-05, "loss": 0.3329, "num_tokens": 172409907.0, "step": 178 }, { "epoch": 1.1553398058252426, "grad_norm": 0.23013430833816528, "learning_rate": 5e-05, "loss": 0.3281, "num_tokens": 173368352.0, "step": 179 }, { "epoch": 1.161812297734628, "grad_norm": 0.2082006186246872, "learning_rate": 5e-05, "loss": 0.3328, "num_tokens": 174328021.0, "step": 180 }, { "epoch": 1.168284789644013, "grad_norm": 0.20928917825222015, "learning_rate": 5e-05, "loss": 0.331, "num_tokens": 175328614.0, "step": 181 }, { "epoch": 1.174757281553398, "grad_norm": 0.1838248372077942, "learning_rate": 5e-05, "loss": 0.3298, "num_tokens": 176287442.0, "step": 182 }, { "epoch": 1.1812297734627832, "grad_norm": 0.1923965960741043, "learning_rate": 5e-05, "loss": 0.3364, "num_tokens": 177272590.0, "step": 183 }, { "epoch": 1.1877022653721683, "grad_norm": 0.23095349967479706, "learning_rate": 5e-05, "loss": 0.3384, "num_tokens": 178243495.0, "step": 184 }, { "epoch": 1.1941747572815533, "grad_norm": 0.21443037688732147, "learning_rate": 5e-05, "loss": 0.3341, "num_tokens": 179230835.0, "step": 185 }, { "epoch": 1.2006472491909386, "grad_norm": 0.18394804000854492, "learning_rate": 5e-05, "loss": 0.3299, "num_tokens": 180197716.0, "step": 186 }, { "epoch": 1.2071197411003236, "grad_norm": 0.19485121965408325, "learning_rate": 5e-05, "loss": 0.3407, "num_tokens": 181168136.0, "step": 187 }, { "epoch": 1.2135922330097086, "grad_norm": 0.18240030109882355, "learning_rate": 5e-05, "loss": 0.3319, "num_tokens": 182127126.0, "step": 188 }, { "epoch": 1.220064724919094, "grad_norm": 0.21046940982341766, "learning_rate": 5e-05, "loss": 0.3347, "num_tokens": 183085892.0, "step": 189 }, { "epoch": 1.226537216828479, "grad_norm": 0.2162950187921524, "learning_rate": 5e-05, "loss": 0.331, "num_tokens": 184057299.0, "step": 190 }, { "epoch": 1.233009708737864, "grad_norm": 0.19470985233783722, "learning_rate": 5e-05, "loss": 0.3294, "num_tokens": 184998791.0, "step": 191 }, { "epoch": 1.2394822006472492, "grad_norm": 0.221969872713089, "learning_rate": 5e-05, "loss": 0.3275, "num_tokens": 185986785.0, "step": 192 }, { "epoch": 1.2459546925566343, "grad_norm": 0.23872560262680054, "learning_rate": 5e-05, "loss": 0.3238, "num_tokens": 186929881.0, "step": 193 }, { "epoch": 1.2524271844660193, "grad_norm": 0.20502091944217682, "learning_rate": 5e-05, "loss": 0.3372, "num_tokens": 187902972.0, "step": 194 }, { "epoch": 1.2588996763754046, "grad_norm": 0.2178352177143097, "learning_rate": 5e-05, "loss": 0.3249, "num_tokens": 188884007.0, "step": 195 }, { "epoch": 1.2653721682847896, "grad_norm": 0.23162619769573212, "learning_rate": 5e-05, "loss": 0.3249, "num_tokens": 189812757.0, "step": 196 }, { "epoch": 1.2718446601941746, "grad_norm": 0.1654340624809265, "learning_rate": 5e-05, "loss": 0.3277, "num_tokens": 190775296.0, "step": 197 }, { "epoch": 1.27831715210356, "grad_norm": 0.20392604172229767, "learning_rate": 5e-05, "loss": 0.3313, "num_tokens": 191750113.0, "step": 198 }, { "epoch": 1.284789644012945, "grad_norm": 0.24326327443122864, "learning_rate": 5e-05, "loss": 0.3313, "num_tokens": 192691419.0, "step": 199 }, { "epoch": 1.29126213592233, "grad_norm": 0.19420462846755981, "learning_rate": 5e-05, "loss": 0.335, "num_tokens": 193680379.0, "step": 200 }, { "epoch": 1.2977346278317152, "grad_norm": 0.19755037128925323, "learning_rate": 5e-05, "loss": 0.3285, "num_tokens": 194655500.0, "step": 201 }, { "epoch": 1.3042071197411003, "grad_norm": 0.20094521343708038, "learning_rate": 5e-05, "loss": 0.333, "num_tokens": 195598008.0, "step": 202 }, { "epoch": 1.3106796116504853, "grad_norm": 0.22174061834812164, "learning_rate": 5e-05, "loss": 0.3367, "num_tokens": 196583549.0, "step": 203 }, { "epoch": 1.3171521035598706, "grad_norm": 0.20323283970355988, "learning_rate": 5e-05, "loss": 0.3334, "num_tokens": 197555491.0, "step": 204 }, { "epoch": 1.3236245954692556, "grad_norm": 0.20889979600906372, "learning_rate": 5e-05, "loss": 0.3274, "num_tokens": 198552040.0, "step": 205 }, { "epoch": 1.3300970873786409, "grad_norm": 0.213238924741745, "learning_rate": 5e-05, "loss": 0.3276, "num_tokens": 199517007.0, "step": 206 }, { "epoch": 1.3365695792880259, "grad_norm": 0.23167595267295837, "learning_rate": 5e-05, "loss": 0.3335, "num_tokens": 200490485.0, "step": 207 }, { "epoch": 1.343042071197411, "grad_norm": 0.17343296110630035, "learning_rate": 5e-05, "loss": 0.3378, "num_tokens": 201459121.0, "step": 208 }, { "epoch": 1.3495145631067962, "grad_norm": 0.2144741266965866, "learning_rate": 5e-05, "loss": 0.3281, "num_tokens": 202378391.0, "step": 209 }, { "epoch": 1.3559870550161812, "grad_norm": 0.21159258484840393, "learning_rate": 5e-05, "loss": 0.3365, "num_tokens": 203389991.0, "step": 210 }, { "epoch": 1.3624595469255665, "grad_norm": 0.19958184659481049, "learning_rate": 5e-05, "loss": 0.3347, "num_tokens": 204350028.0, "step": 211 }, { "epoch": 1.3689320388349515, "grad_norm": 0.24989768862724304, "learning_rate": 5e-05, "loss": 0.3355, "num_tokens": 205321725.0, "step": 212 }, { "epoch": 1.3754045307443366, "grad_norm": 0.20174843072891235, "learning_rate": 5e-05, "loss": 0.3354, "num_tokens": 206300400.0, "step": 213 }, { "epoch": 1.3818770226537218, "grad_norm": 0.21709489822387695, "learning_rate": 5e-05, "loss": 0.3299, "num_tokens": 207269349.0, "step": 214 }, { "epoch": 1.3883495145631068, "grad_norm": 0.18790926039218903, "learning_rate": 5e-05, "loss": 0.3305, "num_tokens": 208239995.0, "step": 215 }, { "epoch": 1.3948220064724919, "grad_norm": 0.20517094433307648, "learning_rate": 5e-05, "loss": 0.3363, "num_tokens": 209197405.0, "step": 216 }, { "epoch": 1.4012944983818771, "grad_norm": 0.21117077767848969, "learning_rate": 5e-05, "loss": 0.3288, "num_tokens": 210173069.0, "step": 217 }, { "epoch": 1.4077669902912622, "grad_norm": 0.2215874046087265, "learning_rate": 5e-05, "loss": 0.3285, "num_tokens": 211140476.0, "step": 218 }, { "epoch": 1.4142394822006472, "grad_norm": 0.23090392351150513, "learning_rate": 5e-05, "loss": 0.3286, "num_tokens": 212101096.0, "step": 219 }, { "epoch": 1.4207119741100325, "grad_norm": 0.2175295352935791, "learning_rate": 5e-05, "loss": 0.3336, "num_tokens": 213068720.0, "step": 220 }, { "epoch": 1.4271844660194175, "grad_norm": 0.19563759863376617, "learning_rate": 5e-05, "loss": 0.3336, "num_tokens": 214020099.0, "step": 221 }, { "epoch": 1.4336569579288025, "grad_norm": 0.18170210719108582, "learning_rate": 5e-05, "loss": 0.3277, "num_tokens": 214947806.0, "step": 222 }, { "epoch": 1.4401294498381878, "grad_norm": 0.1930006891489029, "learning_rate": 5e-05, "loss": 0.3301, "num_tokens": 215925598.0, "step": 223 }, { "epoch": 1.4466019417475728, "grad_norm": 0.1807258129119873, "learning_rate": 5e-05, "loss": 0.3273, "num_tokens": 216915650.0, "step": 224 }, { "epoch": 1.4530744336569579, "grad_norm": 0.17435649037361145, "learning_rate": 5e-05, "loss": 0.3328, "num_tokens": 217865373.0, "step": 225 }, { "epoch": 1.4595469255663431, "grad_norm": 0.15944595634937286, "learning_rate": 5e-05, "loss": 0.3253, "num_tokens": 218823612.0, "step": 226 }, { "epoch": 1.4660194174757282, "grad_norm": 0.17691123485565186, "learning_rate": 5e-05, "loss": 0.3252, "num_tokens": 219799962.0, "step": 227 }, { "epoch": 1.4724919093851132, "grad_norm": 0.18110305070877075, "learning_rate": 5e-05, "loss": 0.3334, "num_tokens": 220771199.0, "step": 228 }, { "epoch": 1.4789644012944985, "grad_norm": 0.1774352341890335, "learning_rate": 5e-05, "loss": 0.3279, "num_tokens": 221736326.0, "step": 229 }, { "epoch": 1.4854368932038835, "grad_norm": 0.19598598778247833, "learning_rate": 5e-05, "loss": 0.3305, "num_tokens": 222710609.0, "step": 230 }, { "epoch": 1.4919093851132685, "grad_norm": 0.22492715716362, "learning_rate": 5e-05, "loss": 0.327, "num_tokens": 223688658.0, "step": 231 }, { "epoch": 1.4983818770226538, "grad_norm": 0.2155875563621521, "learning_rate": 5e-05, "loss": 0.331, "num_tokens": 224689265.0, "step": 232 }, { "epoch": 1.5048543689320388, "grad_norm": 0.196323424577713, "learning_rate": 5e-05, "loss": 0.3286, "num_tokens": 225643475.0, "step": 233 }, { "epoch": 1.5113268608414239, "grad_norm": 0.2056785374879837, "learning_rate": 5e-05, "loss": 0.3323, "num_tokens": 226617652.0, "step": 234 }, { "epoch": 1.5177993527508091, "grad_norm": 0.19503039121627808, "learning_rate": 5e-05, "loss": 0.3298, "num_tokens": 227553267.0, "step": 235 }, { "epoch": 1.5242718446601942, "grad_norm": 0.18793903291225433, "learning_rate": 5e-05, "loss": 0.3302, "num_tokens": 228538404.0, "step": 236 }, { "epoch": 1.5307443365695792, "grad_norm": 0.19170236587524414, "learning_rate": 5e-05, "loss": 0.3284, "num_tokens": 229526832.0, "step": 237 }, { "epoch": 1.5372168284789645, "grad_norm": 0.18152864277362823, "learning_rate": 5e-05, "loss": 0.3337, "num_tokens": 230514566.0, "step": 238 }, { "epoch": 1.5436893203883495, "grad_norm": 0.24174553155899048, "learning_rate": 5e-05, "loss": 0.3308, "num_tokens": 231499488.0, "step": 239 }, { "epoch": 1.5501618122977345, "grad_norm": 0.16658781468868256, "learning_rate": 5e-05, "loss": 0.3401, "num_tokens": 232467260.0, "step": 240 }, { "epoch": 1.5566343042071198, "grad_norm": 0.23242627084255219, "learning_rate": 5e-05, "loss": 0.3331, "num_tokens": 233453448.0, "step": 241 }, { "epoch": 1.5631067961165048, "grad_norm": 0.20731788873672485, "learning_rate": 5e-05, "loss": 0.3272, "num_tokens": 234468336.0, "step": 242 }, { "epoch": 1.5695792880258899, "grad_norm": 0.190266415476799, "learning_rate": 5e-05, "loss": 0.3351, "num_tokens": 235437157.0, "step": 243 }, { "epoch": 1.5760517799352751, "grad_norm": 0.19828887283802032, "learning_rate": 5e-05, "loss": 0.3375, "num_tokens": 236420720.0, "step": 244 }, { "epoch": 1.5825242718446602, "grad_norm": 0.21173498034477234, "learning_rate": 5e-05, "loss": 0.3331, "num_tokens": 237408717.0, "step": 245 }, { "epoch": 1.5889967637540452, "grad_norm": 0.1708352416753769, "learning_rate": 5e-05, "loss": 0.3357, "num_tokens": 238399497.0, "step": 246 }, { "epoch": 1.5954692556634305, "grad_norm": 0.17573760449886322, "learning_rate": 5e-05, "loss": 0.3376, "num_tokens": 239341614.0, "step": 247 }, { "epoch": 1.6019417475728155, "grad_norm": 0.20564796030521393, "learning_rate": 5e-05, "loss": 0.3309, "num_tokens": 240303758.0, "step": 248 }, { "epoch": 1.6084142394822005, "grad_norm": 0.18849660456180573, "learning_rate": 5e-05, "loss": 0.3322, "num_tokens": 241254594.0, "step": 249 }, { "epoch": 1.6148867313915858, "grad_norm": 0.18031689524650574, "learning_rate": 5e-05, "loss": 0.3357, "num_tokens": 242199383.0, "step": 250 }, { "epoch": 1.6213592233009708, "grad_norm": 0.21074460446834564, "learning_rate": 5e-05, "loss": 0.3304, "num_tokens": 243205373.0, "step": 251 }, { "epoch": 1.6278317152103559, "grad_norm": 0.1825997680425644, "learning_rate": 5e-05, "loss": 0.3231, "num_tokens": 244177909.0, "step": 252 }, { "epoch": 1.6343042071197411, "grad_norm": 0.17466188967227936, "learning_rate": 5e-05, "loss": 0.3369, "num_tokens": 245165218.0, "step": 253 }, { "epoch": 1.6407766990291264, "grad_norm": 0.2154398262500763, "learning_rate": 5e-05, "loss": 0.3268, "num_tokens": 246106607.0, "step": 254 }, { "epoch": 1.6472491909385112, "grad_norm": 0.2120603770017624, "learning_rate": 5e-05, "loss": 0.3332, "num_tokens": 247086207.0, "step": 255 }, { "epoch": 1.6537216828478964, "grad_norm": 0.2497498095035553, "learning_rate": 5e-05, "loss": 0.3352, "num_tokens": 248068946.0, "step": 256 }, { "epoch": 1.6601941747572817, "grad_norm": 0.1900237500667572, "learning_rate": 5e-05, "loss": 0.3284, "num_tokens": 249047854.0, "step": 257 }, { "epoch": 1.6666666666666665, "grad_norm": 0.17197246849536896, "learning_rate": 5e-05, "loss": 0.3382, "num_tokens": 250033293.0, "step": 258 }, { "epoch": 1.6731391585760518, "grad_norm": 0.20038750767707825, "learning_rate": 5e-05, "loss": 0.3348, "num_tokens": 251008614.0, "step": 259 }, { "epoch": 1.679611650485437, "grad_norm": 0.28472045063972473, "learning_rate": 5e-05, "loss": 0.3273, "num_tokens": 251959677.0, "step": 260 }, { "epoch": 1.6860841423948218, "grad_norm": 0.17213083803653717, "learning_rate": 5e-05, "loss": 0.3369, "num_tokens": 252935571.0, "step": 261 }, { "epoch": 1.692556634304207, "grad_norm": 0.23536911606788635, "learning_rate": 5e-05, "loss": 0.3356, "num_tokens": 253879032.0, "step": 262 }, { "epoch": 1.6990291262135924, "grad_norm": 0.24911221861839294, "learning_rate": 5e-05, "loss": 0.3222, "num_tokens": 254826418.0, "step": 263 }, { "epoch": 1.7055016181229772, "grad_norm": 0.1864347755908966, "learning_rate": 5e-05, "loss": 0.3345, "num_tokens": 255785864.0, "step": 264 }, { "epoch": 1.7119741100323624, "grad_norm": 0.188665509223938, "learning_rate": 5e-05, "loss": 0.3288, "num_tokens": 256751813.0, "step": 265 }, { "epoch": 1.7184466019417477, "grad_norm": 0.20322351157665253, "learning_rate": 5e-05, "loss": 0.335, "num_tokens": 257721834.0, "step": 266 }, { "epoch": 1.7249190938511327, "grad_norm": 0.17994093894958496, "learning_rate": 5e-05, "loss": 0.3333, "num_tokens": 258673121.0, "step": 267 }, { "epoch": 1.7313915857605178, "grad_norm": 0.17940597236156464, "learning_rate": 5e-05, "loss": 0.3389, "num_tokens": 259660088.0, "step": 268 }, { "epoch": 1.737864077669903, "grad_norm": 0.1803252249956131, "learning_rate": 5e-05, "loss": 0.3378, "num_tokens": 260659515.0, "step": 269 }, { "epoch": 1.744336569579288, "grad_norm": 0.197081059217453, "learning_rate": 5e-05, "loss": 0.3336, "num_tokens": 261673283.0, "step": 270 }, { "epoch": 1.750809061488673, "grad_norm": 0.19758829474449158, "learning_rate": 5e-05, "loss": 0.3256, "num_tokens": 262663793.0, "step": 271 }, { "epoch": 1.7572815533980584, "grad_norm": 0.2050763964653015, "learning_rate": 5e-05, "loss": 0.332, "num_tokens": 263631573.0, "step": 272 }, { "epoch": 1.7637540453074434, "grad_norm": 0.17315641045570374, "learning_rate": 5e-05, "loss": 0.337, "num_tokens": 264625690.0, "step": 273 }, { "epoch": 1.7702265372168284, "grad_norm": 0.19733166694641113, "learning_rate": 5e-05, "loss": 0.3232, "num_tokens": 265585386.0, "step": 274 }, { "epoch": 1.7766990291262137, "grad_norm": 0.17315465211868286, "learning_rate": 5e-05, "loss": 0.3397, "num_tokens": 266565331.0, "step": 275 }, { "epoch": 1.7831715210355987, "grad_norm": 0.18754848837852478, "learning_rate": 5e-05, "loss": 0.3297, "num_tokens": 267570388.0, "step": 276 }, { "epoch": 1.7896440129449838, "grad_norm": 0.20075517892837524, "learning_rate": 5e-05, "loss": 0.3307, "num_tokens": 268536852.0, "step": 277 }, { "epoch": 1.796116504854369, "grad_norm": 0.19061920046806335, "learning_rate": 5e-05, "loss": 0.337, "num_tokens": 269499895.0, "step": 278 }, { "epoch": 1.802588996763754, "grad_norm": 0.18604253232479095, "learning_rate": 5e-05, "loss": 0.3281, "num_tokens": 270449454.0, "step": 279 }, { "epoch": 1.809061488673139, "grad_norm": 0.19853651523590088, "learning_rate": 5e-05, "loss": 0.3265, "num_tokens": 271445748.0, "step": 280 }, { "epoch": 1.8155339805825244, "grad_norm": 0.17544469237327576, "learning_rate": 5e-05, "loss": 0.3333, "num_tokens": 272437678.0, "step": 281 }, { "epoch": 1.8220064724919094, "grad_norm": 0.18316155672073364, "learning_rate": 5e-05, "loss": 0.3351, "num_tokens": 273421632.0, "step": 282 }, { "epoch": 1.8284789644012944, "grad_norm": 0.16321514546871185, "learning_rate": 5e-05, "loss": 0.3258, "num_tokens": 274382121.0, "step": 283 }, { "epoch": 1.8349514563106797, "grad_norm": 0.17352738976478577, "learning_rate": 5e-05, "loss": 0.3229, "num_tokens": 275355409.0, "step": 284 }, { "epoch": 1.8414239482200647, "grad_norm": 0.19399261474609375, "learning_rate": 5e-05, "loss": 0.3269, "num_tokens": 276363366.0, "step": 285 }, { "epoch": 1.8478964401294498, "grad_norm": 0.18889741599559784, "learning_rate": 5e-05, "loss": 0.3331, "num_tokens": 277316291.0, "step": 286 }, { "epoch": 1.854368932038835, "grad_norm": 0.20103295147418976, "learning_rate": 5e-05, "loss": 0.3323, "num_tokens": 278277485.0, "step": 287 }, { "epoch": 1.86084142394822, "grad_norm": 0.21133628487586975, "learning_rate": 5e-05, "loss": 0.3255, "num_tokens": 279263638.0, "step": 288 }, { "epoch": 1.867313915857605, "grad_norm": 0.19138126075267792, "learning_rate": 5e-05, "loss": 0.326, "num_tokens": 280229543.0, "step": 289 }, { "epoch": 1.8737864077669903, "grad_norm": 0.16682711243629456, "learning_rate": 5e-05, "loss": 0.3255, "num_tokens": 281207364.0, "step": 290 }, { "epoch": 1.8802588996763754, "grad_norm": 0.1983303278684616, "learning_rate": 5e-05, "loss": 0.3244, "num_tokens": 282156358.0, "step": 291 }, { "epoch": 1.8867313915857604, "grad_norm": 0.22128702700138092, "learning_rate": 5e-05, "loss": 0.328, "num_tokens": 283119435.0, "step": 292 }, { "epoch": 1.8932038834951457, "grad_norm": 0.1848825216293335, "learning_rate": 5e-05, "loss": 0.334, "num_tokens": 284084676.0, "step": 293 }, { "epoch": 1.8996763754045307, "grad_norm": 0.2012568563222885, "learning_rate": 5e-05, "loss": 0.3462, "num_tokens": 285051682.0, "step": 294 }, { "epoch": 1.9061488673139158, "grad_norm": 0.19058983027935028, "learning_rate": 5e-05, "loss": 0.33, "num_tokens": 286037634.0, "step": 295 }, { "epoch": 1.912621359223301, "grad_norm": 0.17715521156787872, "learning_rate": 5e-05, "loss": 0.3252, "num_tokens": 287028258.0, "step": 296 }, { "epoch": 1.919093851132686, "grad_norm": 0.2144545465707779, "learning_rate": 5e-05, "loss": 0.3286, "num_tokens": 287996979.0, "step": 297 }, { "epoch": 1.925566343042071, "grad_norm": 0.21287760138511658, "learning_rate": 5e-05, "loss": 0.3293, "num_tokens": 288984893.0, "step": 298 }, { "epoch": 1.9320388349514563, "grad_norm": 0.1694762259721756, "learning_rate": 5e-05, "loss": 0.3309, "num_tokens": 289948988.0, "step": 299 }, { "epoch": 1.9385113268608414, "grad_norm": 0.17792214453220367, "learning_rate": 5e-05, "loss": 0.3357, "num_tokens": 290947258.0, "step": 300 }, { "epoch": 1.9449838187702264, "grad_norm": 0.20724987983703613, "learning_rate": 5e-05, "loss": 0.3326, "num_tokens": 291896825.0, "step": 301 }, { "epoch": 1.9514563106796117, "grad_norm": 0.20822256803512573, "learning_rate": 5e-05, "loss": 0.3223, "num_tokens": 292874918.0, "step": 302 }, { "epoch": 1.9579288025889967, "grad_norm": 0.19984842836856842, "learning_rate": 5e-05, "loss": 0.3333, "num_tokens": 293858095.0, "step": 303 }, { "epoch": 1.9644012944983817, "grad_norm": 0.17398180067539215, "learning_rate": 5e-05, "loss": 0.3346, "num_tokens": 294831363.0, "step": 304 }, { "epoch": 1.970873786407767, "grad_norm": 0.15951240062713623, "learning_rate": 5e-05, "loss": 0.3253, "num_tokens": 295815029.0, "step": 305 }, { "epoch": 1.9773462783171523, "grad_norm": 0.17374330759048462, "learning_rate": 5e-05, "loss": 0.337, "num_tokens": 296783225.0, "step": 306 }, { "epoch": 1.983818770226537, "grad_norm": 0.18662990629673004, "learning_rate": 5e-05, "loss": 0.3285, "num_tokens": 297740638.0, "step": 307 }, { "epoch": 1.9902912621359223, "grad_norm": 0.18480151891708374, "learning_rate": 5e-05, "loss": 0.3391, "num_tokens": 298713508.0, "step": 308 }, { "epoch": 1.9967637540453076, "grad_norm": 0.1621216982603073, "learning_rate": 5e-05, "loss": 0.3313, "num_tokens": 299687136.0, "step": 309 }, { "epoch": 2.0, "grad_norm": 0.26159778237342834, "learning_rate": 5e-05, "loss": 0.3051, "num_tokens": 300160844.0, "step": 310 }, { "epoch": 2.0064724919093853, "grad_norm": 0.24619221687316895, "learning_rate": 5e-05, "loss": 0.2508, "num_tokens": 301102396.0, "step": 311 }, { "epoch": 2.01294498381877, "grad_norm": 0.2561141550540924, "learning_rate": 5e-05, "loss": 0.2506, "num_tokens": 302075156.0, "step": 312 }, { "epoch": 2.0194174757281553, "grad_norm": 0.24942228198051453, "learning_rate": 5e-05, "loss": 0.2493, "num_tokens": 303057420.0, "step": 313 }, { "epoch": 2.0258899676375406, "grad_norm": 0.20466116070747375, "learning_rate": 5e-05, "loss": 0.2458, "num_tokens": 304035012.0, "step": 314 }, { "epoch": 2.0323624595469254, "grad_norm": 0.2566760182380676, "learning_rate": 5e-05, "loss": 0.2516, "num_tokens": 305020171.0, "step": 315 }, { "epoch": 2.0388349514563107, "grad_norm": 0.23247423768043518, "learning_rate": 5e-05, "loss": 0.2492, "num_tokens": 305997805.0, "step": 316 }, { "epoch": 2.045307443365696, "grad_norm": 0.21733851730823517, "learning_rate": 5e-05, "loss": 0.2483, "num_tokens": 306946495.0, "step": 317 }, { "epoch": 2.0517799352750807, "grad_norm": 0.2133636474609375, "learning_rate": 5e-05, "loss": 0.2501, "num_tokens": 307898930.0, "step": 318 }, { "epoch": 2.058252427184466, "grad_norm": 0.21891073882579803, "learning_rate": 5e-05, "loss": 0.2441, "num_tokens": 308853869.0, "step": 319 }, { "epoch": 2.0647249190938513, "grad_norm": 0.18485242128372192, "learning_rate": 5e-05, "loss": 0.2458, "num_tokens": 309835585.0, "step": 320 }, { "epoch": 2.071197411003236, "grad_norm": 0.19305238127708435, "learning_rate": 5e-05, "loss": 0.2423, "num_tokens": 310815130.0, "step": 321 }, { "epoch": 2.0776699029126213, "grad_norm": 0.23651911318302155, "learning_rate": 5e-05, "loss": 0.2524, "num_tokens": 311797693.0, "step": 322 }, { "epoch": 2.0841423948220066, "grad_norm": 0.19556501507759094, "learning_rate": 5e-05, "loss": 0.2445, "num_tokens": 312741860.0, "step": 323 }, { "epoch": 2.0906148867313914, "grad_norm": 0.24229426681995392, "learning_rate": 5e-05, "loss": 0.2473, "num_tokens": 313699114.0, "step": 324 }, { "epoch": 2.0970873786407767, "grad_norm": 0.2109650820493698, "learning_rate": 5e-05, "loss": 0.2484, "num_tokens": 314676353.0, "step": 325 }, { "epoch": 2.103559870550162, "grad_norm": 0.17984916269779205, "learning_rate": 5e-05, "loss": 0.2512, "num_tokens": 315619556.0, "step": 326 }, { "epoch": 2.1100323624595467, "grad_norm": 0.21856306493282318, "learning_rate": 5e-05, "loss": 0.251, "num_tokens": 316593419.0, "step": 327 }, { "epoch": 2.116504854368932, "grad_norm": 0.1917211264371872, "learning_rate": 5e-05, "loss": 0.2388, "num_tokens": 317586062.0, "step": 328 }, { "epoch": 2.1229773462783172, "grad_norm": 0.1996285766363144, "learning_rate": 5e-05, "loss": 0.2457, "num_tokens": 318557320.0, "step": 329 }, { "epoch": 2.129449838187702, "grad_norm": 0.20184561610221863, "learning_rate": 5e-05, "loss": 0.2463, "num_tokens": 319521712.0, "step": 330 }, { "epoch": 2.1359223300970873, "grad_norm": 0.2217031568288803, "learning_rate": 5e-05, "loss": 0.2507, "num_tokens": 320503248.0, "step": 331 }, { "epoch": 2.1423948220064726, "grad_norm": 0.24039798974990845, "learning_rate": 5e-05, "loss": 0.2511, "num_tokens": 321501454.0, "step": 332 }, { "epoch": 2.148867313915858, "grad_norm": 0.2121853083372116, "learning_rate": 5e-05, "loss": 0.2486, "num_tokens": 322456831.0, "step": 333 }, { "epoch": 2.1553398058252426, "grad_norm": 0.1870715618133545, "learning_rate": 5e-05, "loss": 0.2459, "num_tokens": 323415821.0, "step": 334 }, { "epoch": 2.161812297734628, "grad_norm": 0.19711925089359283, "learning_rate": 5e-05, "loss": 0.2496, "num_tokens": 324385718.0, "step": 335 }, { "epoch": 2.168284789644013, "grad_norm": 0.1879681944847107, "learning_rate": 5e-05, "loss": 0.2474, "num_tokens": 325330934.0, "step": 336 }, { "epoch": 2.174757281553398, "grad_norm": 0.17814786732196808, "learning_rate": 5e-05, "loss": 0.2478, "num_tokens": 326277294.0, "step": 337 }, { "epoch": 2.1812297734627832, "grad_norm": 0.19003082811832428, "learning_rate": 5e-05, "loss": 0.2443, "num_tokens": 327249393.0, "step": 338 }, { "epoch": 2.1877022653721685, "grad_norm": 0.2001635879278183, "learning_rate": 5e-05, "loss": 0.2485, "num_tokens": 328230895.0, "step": 339 }, { "epoch": 2.1941747572815533, "grad_norm": 0.20362168550491333, "learning_rate": 5e-05, "loss": 0.2494, "num_tokens": 329211471.0, "step": 340 }, { "epoch": 2.2006472491909386, "grad_norm": 0.22626088559627533, "learning_rate": 5e-05, "loss": 0.2502, "num_tokens": 330165352.0, "step": 341 }, { "epoch": 2.207119741100324, "grad_norm": 0.18977487087249756, "learning_rate": 5e-05, "loss": 0.2459, "num_tokens": 331112466.0, "step": 342 }, { "epoch": 2.2135922330097086, "grad_norm": 0.1859959214925766, "learning_rate": 5e-05, "loss": 0.2433, "num_tokens": 332076946.0, "step": 343 }, { "epoch": 2.220064724919094, "grad_norm": 0.17296473681926727, "learning_rate": 5e-05, "loss": 0.2442, "num_tokens": 333063256.0, "step": 344 }, { "epoch": 2.226537216828479, "grad_norm": 0.17386777698993683, "learning_rate": 5e-05, "loss": 0.252, "num_tokens": 334026829.0, "step": 345 }, { "epoch": 2.233009708737864, "grad_norm": 0.16889411211013794, "learning_rate": 5e-05, "loss": 0.2456, "num_tokens": 335023881.0, "step": 346 }, { "epoch": 2.2394822006472492, "grad_norm": 0.17656902968883514, "learning_rate": 5e-05, "loss": 0.2469, "num_tokens": 336010678.0, "step": 347 }, { "epoch": 2.2459546925566345, "grad_norm": 0.1883360743522644, "learning_rate": 5e-05, "loss": 0.2505, "num_tokens": 336975057.0, "step": 348 }, { "epoch": 2.2524271844660193, "grad_norm": 0.18347224593162537, "learning_rate": 5e-05, "loss": 0.2503, "num_tokens": 337918498.0, "step": 349 }, { "epoch": 2.2588996763754046, "grad_norm": 0.19474223256111145, "learning_rate": 5e-05, "loss": 0.2464, "num_tokens": 338909441.0, "step": 350 }, { "epoch": 2.26537216828479, "grad_norm": 0.21109187602996826, "learning_rate": 5e-05, "loss": 0.2528, "num_tokens": 339884565.0, "step": 351 }, { "epoch": 2.2718446601941746, "grad_norm": 0.21488125622272491, "learning_rate": 5e-05, "loss": 0.247, "num_tokens": 340849615.0, "step": 352 }, { "epoch": 2.27831715210356, "grad_norm": 0.21163029968738556, "learning_rate": 5e-05, "loss": 0.2532, "num_tokens": 341812130.0, "step": 353 }, { "epoch": 2.284789644012945, "grad_norm": 0.19011302292346954, "learning_rate": 5e-05, "loss": 0.2555, "num_tokens": 342798689.0, "step": 354 }, { "epoch": 2.29126213592233, "grad_norm": 0.16521933674812317, "learning_rate": 5e-05, "loss": 0.2512, "num_tokens": 343777378.0, "step": 355 }, { "epoch": 2.2977346278317152, "grad_norm": 0.1690284013748169, "learning_rate": 5e-05, "loss": 0.2546, "num_tokens": 344722430.0, "step": 356 }, { "epoch": 2.3042071197411005, "grad_norm": 0.16559071838855743, "learning_rate": 5e-05, "loss": 0.2481, "num_tokens": 345692509.0, "step": 357 }, { "epoch": 2.3106796116504853, "grad_norm": 0.16394446790218353, "learning_rate": 5e-05, "loss": 0.2491, "num_tokens": 346680167.0, "step": 358 }, { "epoch": 2.3171521035598706, "grad_norm": 0.1614151895046234, "learning_rate": 5e-05, "loss": 0.2558, "num_tokens": 347684225.0, "step": 359 }, { "epoch": 2.323624595469256, "grad_norm": 0.18255068361759186, "learning_rate": 5e-05, "loss": 0.2509, "num_tokens": 348650945.0, "step": 360 }, { "epoch": 2.3300970873786406, "grad_norm": 0.1942760944366455, "learning_rate": 5e-05, "loss": 0.2516, "num_tokens": 349609701.0, "step": 361 }, { "epoch": 2.336569579288026, "grad_norm": 0.19880688190460205, "learning_rate": 5e-05, "loss": 0.2523, "num_tokens": 350570434.0, "step": 362 }, { "epoch": 2.343042071197411, "grad_norm": 0.20108526945114136, "learning_rate": 5e-05, "loss": 0.254, "num_tokens": 351557109.0, "step": 363 }, { "epoch": 2.349514563106796, "grad_norm": 0.182880237698555, "learning_rate": 5e-05, "loss": 0.2563, "num_tokens": 352535505.0, "step": 364 }, { "epoch": 2.355987055016181, "grad_norm": 0.18121665716171265, "learning_rate": 5e-05, "loss": 0.2529, "num_tokens": 353532460.0, "step": 365 }, { "epoch": 2.3624595469255665, "grad_norm": 0.16655148565769196, "learning_rate": 5e-05, "loss": 0.255, "num_tokens": 354512411.0, "step": 366 }, { "epoch": 2.3689320388349513, "grad_norm": 0.1869032382965088, "learning_rate": 5e-05, "loss": 0.2533, "num_tokens": 355502656.0, "step": 367 }, { "epoch": 2.3754045307443366, "grad_norm": 0.19722098112106323, "learning_rate": 5e-05, "loss": 0.2506, "num_tokens": 356484097.0, "step": 368 }, { "epoch": 2.381877022653722, "grad_norm": 0.19324584305286407, "learning_rate": 5e-05, "loss": 0.2588, "num_tokens": 357461867.0, "step": 369 }, { "epoch": 2.3883495145631066, "grad_norm": 0.2080702930688858, "learning_rate": 5e-05, "loss": 0.2566, "num_tokens": 358424717.0, "step": 370 }, { "epoch": 2.394822006472492, "grad_norm": 0.18283598124980927, "learning_rate": 5e-05, "loss": 0.2569, "num_tokens": 359395193.0, "step": 371 }, { "epoch": 2.401294498381877, "grad_norm": 0.21475201845169067, "learning_rate": 5e-05, "loss": 0.2542, "num_tokens": 360370130.0, "step": 372 }, { "epoch": 2.407766990291262, "grad_norm": 0.21183514595031738, "learning_rate": 5e-05, "loss": 0.2558, "num_tokens": 361338296.0, "step": 373 }, { "epoch": 2.414239482200647, "grad_norm": 0.2067357450723648, "learning_rate": 5e-05, "loss": 0.2502, "num_tokens": 362303269.0, "step": 374 }, { "epoch": 2.4207119741100325, "grad_norm": 0.19343790411949158, "learning_rate": 5e-05, "loss": 0.2566, "num_tokens": 363269672.0, "step": 375 }, { "epoch": 2.4271844660194173, "grad_norm": 0.18862544000148773, "learning_rate": 5e-05, "loss": 0.2532, "num_tokens": 364215172.0, "step": 376 }, { "epoch": 2.4336569579288025, "grad_norm": 0.1895538717508316, "learning_rate": 5e-05, "loss": 0.2581, "num_tokens": 365199593.0, "step": 377 }, { "epoch": 2.440129449838188, "grad_norm": 0.1829875111579895, "learning_rate": 5e-05, "loss": 0.2533, "num_tokens": 366173715.0, "step": 378 }, { "epoch": 2.4466019417475726, "grad_norm": 0.18116118013858795, "learning_rate": 5e-05, "loss": 0.2555, "num_tokens": 367134817.0, "step": 379 }, { "epoch": 2.453074433656958, "grad_norm": 0.16913911700248718, "learning_rate": 5e-05, "loss": 0.2599, "num_tokens": 368117521.0, "step": 380 }, { "epoch": 2.459546925566343, "grad_norm": 0.17233212292194366, "learning_rate": 5e-05, "loss": 0.259, "num_tokens": 369068188.0, "step": 381 }, { "epoch": 2.466019417475728, "grad_norm": 0.17806458473205566, "learning_rate": 5e-05, "loss": 0.2527, "num_tokens": 370072565.0, "step": 382 }, { "epoch": 2.472491909385113, "grad_norm": 0.19235529005527496, "learning_rate": 5e-05, "loss": 0.2527, "num_tokens": 371036282.0, "step": 383 }, { "epoch": 2.4789644012944985, "grad_norm": 0.2100851833820343, "learning_rate": 5e-05, "loss": 0.2571, "num_tokens": 372022740.0, "step": 384 }, { "epoch": 2.4854368932038833, "grad_norm": 0.21587085723876953, "learning_rate": 5e-05, "loss": 0.2561, "num_tokens": 373000607.0, "step": 385 }, { "epoch": 2.4919093851132685, "grad_norm": 0.18687674403190613, "learning_rate": 5e-05, "loss": 0.2566, "num_tokens": 373986218.0, "step": 386 }, { "epoch": 2.498381877022654, "grad_norm": 0.17813560366630554, "learning_rate": 5e-05, "loss": 0.262, "num_tokens": 374968373.0, "step": 387 }, { "epoch": 2.5048543689320386, "grad_norm": 0.189350888133049, "learning_rate": 5e-05, "loss": 0.2623, "num_tokens": 375953242.0, "step": 388 }, { "epoch": 2.511326860841424, "grad_norm": 0.18417784571647644, "learning_rate": 5e-05, "loss": 0.2565, "num_tokens": 376930298.0, "step": 389 }, { "epoch": 2.517799352750809, "grad_norm": 0.1867777556180954, "learning_rate": 5e-05, "loss": 0.2516, "num_tokens": 377892697.0, "step": 390 }, { "epoch": 2.524271844660194, "grad_norm": 0.17608080804347992, "learning_rate": 5e-05, "loss": 0.2569, "num_tokens": 378851468.0, "step": 391 }, { "epoch": 2.530744336569579, "grad_norm": 0.17935039103031158, "learning_rate": 5e-05, "loss": 0.2539, "num_tokens": 379796295.0, "step": 392 }, { "epoch": 2.5372168284789645, "grad_norm": 0.17308013141155243, "learning_rate": 5e-05, "loss": 0.253, "num_tokens": 380749026.0, "step": 393 }, { "epoch": 2.5436893203883493, "grad_norm": 0.1875937283039093, "learning_rate": 5e-05, "loss": 0.2602, "num_tokens": 381732552.0, "step": 394 }, { "epoch": 2.5501618122977345, "grad_norm": 0.20413626730442047, "learning_rate": 5e-05, "loss": 0.2579, "num_tokens": 382685936.0, "step": 395 }, { "epoch": 2.55663430420712, "grad_norm": 0.21135841310024261, "learning_rate": 5e-05, "loss": 0.2583, "num_tokens": 383650180.0, "step": 396 }, { "epoch": 2.5631067961165046, "grad_norm": 0.17223158478736877, "learning_rate": 5e-05, "loss": 0.2599, "num_tokens": 384648157.0, "step": 397 }, { "epoch": 2.56957928802589, "grad_norm": 0.1730843186378479, "learning_rate": 5e-05, "loss": 0.2512, "num_tokens": 385633057.0, "step": 398 }, { "epoch": 2.576051779935275, "grad_norm": 0.23102404177188873, "learning_rate": 5e-05, "loss": 0.2605, "num_tokens": 386630696.0, "step": 399 }, { "epoch": 2.58252427184466, "grad_norm": 0.19044920802116394, "learning_rate": 5e-05, "loss": 0.2539, "num_tokens": 387613957.0, "step": 400 }, { "epoch": 2.588996763754045, "grad_norm": 0.16665557026863098, "learning_rate": 5e-05, "loss": 0.259, "num_tokens": 388558267.0, "step": 401 }, { "epoch": 2.5954692556634305, "grad_norm": 0.19409890472888947, "learning_rate": 5e-05, "loss": 0.2543, "num_tokens": 389538588.0, "step": 402 }, { "epoch": 2.6019417475728153, "grad_norm": 0.18489298224449158, "learning_rate": 5e-05, "loss": 0.2562, "num_tokens": 390511972.0, "step": 403 }, { "epoch": 2.6084142394822005, "grad_norm": 0.18002814054489136, "learning_rate": 5e-05, "loss": 0.2612, "num_tokens": 391448045.0, "step": 404 }, { "epoch": 2.614886731391586, "grad_norm": 0.1813887655735016, "learning_rate": 5e-05, "loss": 0.2569, "num_tokens": 392472093.0, "step": 405 }, { "epoch": 2.6213592233009706, "grad_norm": 0.15946455299854279, "learning_rate": 5e-05, "loss": 0.2566, "num_tokens": 393430332.0, "step": 406 }, { "epoch": 2.627831715210356, "grad_norm": 0.17943687736988068, "learning_rate": 5e-05, "loss": 0.2641, "num_tokens": 394429159.0, "step": 407 }, { "epoch": 2.634304207119741, "grad_norm": 0.17934055626392365, "learning_rate": 5e-05, "loss": 0.2572, "num_tokens": 395426297.0, "step": 408 }, { "epoch": 2.6407766990291264, "grad_norm": 0.1804719716310501, "learning_rate": 5e-05, "loss": 0.2592, "num_tokens": 396358492.0, "step": 409 }, { "epoch": 2.647249190938511, "grad_norm": 0.18092380464076996, "learning_rate": 5e-05, "loss": 0.253, "num_tokens": 397312727.0, "step": 410 }, { "epoch": 2.6537216828478964, "grad_norm": 0.17374356091022491, "learning_rate": 5e-05, "loss": 0.2533, "num_tokens": 398277677.0, "step": 411 }, { "epoch": 2.6601941747572817, "grad_norm": 0.16225680708885193, "learning_rate": 5e-05, "loss": 0.2586, "num_tokens": 399270732.0, "step": 412 }, { "epoch": 2.6666666666666665, "grad_norm": 0.1800750494003296, "learning_rate": 5e-05, "loss": 0.2572, "num_tokens": 400214224.0, "step": 413 }, { "epoch": 2.6731391585760518, "grad_norm": 0.17085859179496765, "learning_rate": 5e-05, "loss": 0.263, "num_tokens": 401197793.0, "step": 414 }, { "epoch": 2.679611650485437, "grad_norm": 0.19004404544830322, "learning_rate": 5e-05, "loss": 0.2614, "num_tokens": 402201191.0, "step": 415 }, { "epoch": 2.686084142394822, "grad_norm": 0.168423131108284, "learning_rate": 5e-05, "loss": 0.2627, "num_tokens": 403166232.0, "step": 416 }, { "epoch": 2.692556634304207, "grad_norm": 0.18137681484222412, "learning_rate": 5e-05, "loss": 0.2645, "num_tokens": 404128378.0, "step": 417 }, { "epoch": 2.6990291262135924, "grad_norm": 0.1873692125082016, "learning_rate": 5e-05, "loss": 0.2548, "num_tokens": 405053317.0, "step": 418 }, { "epoch": 2.705501618122977, "grad_norm": 0.16886006295681, "learning_rate": 5e-05, "loss": 0.2612, "num_tokens": 406034988.0, "step": 419 }, { "epoch": 2.7119741100323624, "grad_norm": 0.16958333551883698, "learning_rate": 5e-05, "loss": 0.2601, "num_tokens": 406979391.0, "step": 420 }, { "epoch": 2.7184466019417477, "grad_norm": 0.17350246012210846, "learning_rate": 5e-05, "loss": 0.2544, "num_tokens": 407949200.0, "step": 421 }, { "epoch": 2.724919093851133, "grad_norm": 0.1711425632238388, "learning_rate": 5e-05, "loss": 0.2642, "num_tokens": 408927182.0, "step": 422 }, { "epoch": 2.7313915857605178, "grad_norm": 0.1715414822101593, "learning_rate": 5e-05, "loss": 0.2602, "num_tokens": 409918480.0, "step": 423 }, { "epoch": 2.737864077669903, "grad_norm": 0.1698462963104248, "learning_rate": 5e-05, "loss": 0.2561, "num_tokens": 410898982.0, "step": 424 }, { "epoch": 2.7443365695792883, "grad_norm": 0.15912197530269623, "learning_rate": 5e-05, "loss": 0.2562, "num_tokens": 411875141.0, "step": 425 }, { "epoch": 2.750809061488673, "grad_norm": 0.1661800593137741, "learning_rate": 5e-05, "loss": 0.2595, "num_tokens": 412824562.0, "step": 426 }, { "epoch": 2.7572815533980584, "grad_norm": 0.1626822054386139, "learning_rate": 5e-05, "loss": 0.2578, "num_tokens": 413825079.0, "step": 427 }, { "epoch": 2.7637540453074436, "grad_norm": 0.16768640279769897, "learning_rate": 5e-05, "loss": 0.2609, "num_tokens": 414777217.0, "step": 428 }, { "epoch": 2.7702265372168284, "grad_norm": 0.17253726720809937, "learning_rate": 5e-05, "loss": 0.2629, "num_tokens": 415746492.0, "step": 429 }, { "epoch": 2.7766990291262137, "grad_norm": 0.16866855323314667, "learning_rate": 5e-05, "loss": 0.2573, "num_tokens": 416686095.0, "step": 430 }, { "epoch": 2.783171521035599, "grad_norm": 0.1762491762638092, "learning_rate": 5e-05, "loss": 0.2569, "num_tokens": 417634189.0, "step": 431 }, { "epoch": 2.7896440129449838, "grad_norm": 0.16174381971359253, "learning_rate": 5e-05, "loss": 0.262, "num_tokens": 418602339.0, "step": 432 }, { "epoch": 2.796116504854369, "grad_norm": 0.15794412791728973, "learning_rate": 5e-05, "loss": 0.2583, "num_tokens": 419563873.0, "step": 433 }, { "epoch": 2.8025889967637543, "grad_norm": 0.15204046666622162, "learning_rate": 5e-05, "loss": 0.2605, "num_tokens": 420562393.0, "step": 434 }, { "epoch": 2.809061488673139, "grad_norm": 0.1789230853319168, "learning_rate": 5e-05, "loss": 0.2613, "num_tokens": 421523768.0, "step": 435 }, { "epoch": 2.8155339805825244, "grad_norm": 0.17823684215545654, "learning_rate": 5e-05, "loss": 0.2605, "num_tokens": 422472099.0, "step": 436 }, { "epoch": 2.8220064724919096, "grad_norm": 0.16607439517974854, "learning_rate": 5e-05, "loss": 0.2633, "num_tokens": 423413417.0, "step": 437 }, { "epoch": 2.8284789644012944, "grad_norm": 0.1627141535282135, "learning_rate": 5e-05, "loss": 0.2573, "num_tokens": 424390973.0, "step": 438 }, { "epoch": 2.8349514563106797, "grad_norm": 0.1732834130525589, "learning_rate": 5e-05, "loss": 0.2671, "num_tokens": 425350702.0, "step": 439 }, { "epoch": 2.841423948220065, "grad_norm": 0.1753663271665573, "learning_rate": 5e-05, "loss": 0.2605, "num_tokens": 426331291.0, "step": 440 }, { "epoch": 2.8478964401294498, "grad_norm": 0.1652359813451767, "learning_rate": 5e-05, "loss": 0.2594, "num_tokens": 427296200.0, "step": 441 }, { "epoch": 2.854368932038835, "grad_norm": 0.17196263372898102, "learning_rate": 5e-05, "loss": 0.26, "num_tokens": 428296391.0, "step": 442 }, { "epoch": 2.8608414239482203, "grad_norm": 0.20355266332626343, "learning_rate": 5e-05, "loss": 0.2594, "num_tokens": 429272004.0, "step": 443 }, { "epoch": 2.867313915857605, "grad_norm": 0.23897817730903625, "learning_rate": 5e-05, "loss": 0.2626, "num_tokens": 430235969.0, "step": 444 }, { "epoch": 2.8737864077669903, "grad_norm": 0.18195684254169464, "learning_rate": 5e-05, "loss": 0.2604, "num_tokens": 431174501.0, "step": 445 }, { "epoch": 2.8802588996763756, "grad_norm": 0.1773492842912674, "learning_rate": 5e-05, "loss": 0.2646, "num_tokens": 432123098.0, "step": 446 }, { "epoch": 2.8867313915857604, "grad_norm": 0.1970640867948532, "learning_rate": 5e-05, "loss": 0.2692, "num_tokens": 433097167.0, "step": 447 }, { "epoch": 2.8932038834951457, "grad_norm": 0.23633213341236115, "learning_rate": 5e-05, "loss": 0.2682, "num_tokens": 434079373.0, "step": 448 }, { "epoch": 2.899676375404531, "grad_norm": 0.1687038540840149, "learning_rate": 5e-05, "loss": 0.2605, "num_tokens": 435065151.0, "step": 449 }, { "epoch": 2.9061488673139158, "grad_norm": 0.17610131204128265, "learning_rate": 5e-05, "loss": 0.2649, "num_tokens": 436063130.0, "step": 450 }, { "epoch": 2.912621359223301, "grad_norm": 0.20696723461151123, "learning_rate": 5e-05, "loss": 0.2595, "num_tokens": 437032520.0, "step": 451 }, { "epoch": 2.9190938511326863, "grad_norm": 0.17896310985088348, "learning_rate": 5e-05, "loss": 0.2671, "num_tokens": 437990354.0, "step": 452 }, { "epoch": 2.925566343042071, "grad_norm": 0.1712501496076584, "learning_rate": 5e-05, "loss": 0.2589, "num_tokens": 439009642.0, "step": 453 }, { "epoch": 2.9320388349514563, "grad_norm": 0.17318488657474518, "learning_rate": 5e-05, "loss": 0.2589, "num_tokens": 439985459.0, "step": 454 }, { "epoch": 2.9385113268608416, "grad_norm": 0.18446685373783112, "learning_rate": 5e-05, "loss": 0.2573, "num_tokens": 440955967.0, "step": 455 }, { "epoch": 2.9449838187702264, "grad_norm": 0.17319926619529724, "learning_rate": 5e-05, "loss": 0.2561, "num_tokens": 441912223.0, "step": 456 }, { "epoch": 2.9514563106796117, "grad_norm": 0.1783011555671692, "learning_rate": 5e-05, "loss": 0.2652, "num_tokens": 442916739.0, "step": 457 }, { "epoch": 2.957928802588997, "grad_norm": 0.17187394201755524, "learning_rate": 5e-05, "loss": 0.2566, "num_tokens": 443875303.0, "step": 458 }, { "epoch": 2.9644012944983817, "grad_norm": 0.16575132310390472, "learning_rate": 5e-05, "loss": 0.2647, "num_tokens": 444853831.0, "step": 459 }, { "epoch": 2.970873786407767, "grad_norm": 0.16645722091197968, "learning_rate": 5e-05, "loss": 0.261, "num_tokens": 445812377.0, "step": 460 }, { "epoch": 2.9773462783171523, "grad_norm": 0.18371130526065826, "learning_rate": 5e-05, "loss": 0.2655, "num_tokens": 446783660.0, "step": 461 }, { "epoch": 2.983818770226537, "grad_norm": 0.1664678156375885, "learning_rate": 5e-05, "loss": 0.2639, "num_tokens": 447813574.0, "step": 462 }, { "epoch": 2.983818770226537, "step": 462, "total_flos": 1.8506098969109594e+19, "train_loss": 0.3528567839300994, "train_runtime": 12007.34, "train_samples_per_second": 17.272, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 24, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8506098969109594e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }