[ { "loss": 2.7206878662109375, "grad_norm": 0.49609375, "learning_rate": 0.0, "entropy": 2.608895570039749, "num_tokens": 4649.0, "mean_token_accuracy": 0.462592713534832, "epoch": 0.03404255319148936, "step": 1 }, { "loss": 2.7071635723114014, "grad_norm": 0.46484375, "learning_rate": 2.0000000000000003e-06, "entropy": 2.545351803302765, "num_tokens": 9803.0, "mean_token_accuracy": 0.4683124031871557, "epoch": 0.06808510638297872, "step": 2 }, { "loss": 2.7231075763702393, "grad_norm": 0.474609375, "learning_rate": 4.000000000000001e-06, "entropy": 2.5835197120904922, "num_tokens": 14840.0, "mean_token_accuracy": 0.4721362516283989, "epoch": 0.10212765957446808, "step": 3 }, { "loss": 2.6703782081604004, "grad_norm": 0.47265625, "learning_rate": 6e-06, "entropy": 2.5479989051818848, "num_tokens": 19940.0, "mean_token_accuracy": 0.4756350498646498, "epoch": 0.13617021276595745, "step": 4 }, { "loss": 2.6016807556152344, "grad_norm": 0.482421875, "learning_rate": 8.000000000000001e-06, "entropy": 2.4778225421905518, "num_tokens": 24812.0, "mean_token_accuracy": 0.47054850682616234, "epoch": 0.1702127659574468, "step": 5 }, { "loss": 2.676024913787842, "grad_norm": 0.462890625, "learning_rate": 1e-05, "entropy": 2.5059917718172073, "num_tokens": 30009.0, "mean_token_accuracy": 0.47442322224378586, "epoch": 0.20425531914893616, "step": 6 }, { "loss": 2.6859350204467773, "grad_norm": 0.490234375, "learning_rate": 1.2e-05, "entropy": 2.5689117461442947, "num_tokens": 34795.0, "mean_token_accuracy": 0.46575408801436424, "epoch": 0.23829787234042554, "step": 7 }, { "loss": 2.6369540691375732, "grad_norm": 0.4765625, "learning_rate": 1.4000000000000001e-05, "entropy": 2.546092763543129, "num_tokens": 39689.0, "mean_token_accuracy": 0.47395625710487366, "epoch": 0.2723404255319149, "step": 8 }, { "loss": 2.6600987911224365, "grad_norm": 0.462890625, "learning_rate": 1.6000000000000003e-05, "entropy": 2.5432861000299454, "num_tokens": 44725.0, "mean_token_accuracy": 0.4628645218908787, "epoch": 0.30638297872340425, "step": 9 }, { "loss": 2.622265100479126, "grad_norm": 0.458984375, "learning_rate": 1.8e-05, "entropy": 2.4821661859750748, "num_tokens": 49906.0, "mean_token_accuracy": 0.4776103775948286, "epoch": 0.3404255319148936, "step": 10 }, { "loss": 2.7507314682006836, "grad_norm": 0.50390625, "learning_rate": 2e-05, "entropy": 2.6339701265096664, "num_tokens": 54413.0, "mean_token_accuracy": 0.44635788537561893, "epoch": 0.37446808510638296, "step": 11 }, { "loss": 2.628227710723877, "grad_norm": 0.462890625, "learning_rate": 2.2000000000000003e-05, "entropy": 2.5198942124843597, "num_tokens": 59209.0, "mean_token_accuracy": 0.4707046877592802, "epoch": 0.4085106382978723, "step": 12 }, { "loss": 2.611868143081665, "grad_norm": 0.45703125, "learning_rate": 2.4e-05, "entropy": 2.493274748325348, "num_tokens": 63993.0, "mean_token_accuracy": 0.4696641284972429, "epoch": 0.4425531914893617, "step": 13 }, { "loss": 2.6302144527435303, "grad_norm": 0.44140625, "learning_rate": 2.6000000000000002e-05, "entropy": 2.5304851830005646, "num_tokens": 69116.0, "mean_token_accuracy": 0.4738536272197962, "epoch": 0.4765957446808511, "step": 14 }, { "loss": 2.662710428237915, "grad_norm": 0.427734375, "learning_rate": 2.8000000000000003e-05, "entropy": 2.529924839735031, "num_tokens": 74375.0, "mean_token_accuracy": 0.4867155533283949, "epoch": 0.5106382978723404, "step": 15 }, { "loss": 2.612766742706299, "grad_norm": 0.435546875, "learning_rate": 3e-05, "entropy": 2.481363832950592, "num_tokens": 79435.0, "mean_token_accuracy": 0.474876606836915, "epoch": 0.5446808510638298, "step": 16 }, { "loss": 2.6860921382904053, "grad_norm": 0.427734375, "learning_rate": 3.2000000000000005e-05, "entropy": 2.5851253271102905, "num_tokens": 84468.0, "mean_token_accuracy": 0.46423414535820484, "epoch": 0.5787234042553191, "step": 17 }, { "loss": 2.657914400100708, "grad_norm": 0.427734375, "learning_rate": 3.4000000000000007e-05, "entropy": 2.53963340818882, "num_tokens": 89506.0, "mean_token_accuracy": 0.4672777969390154, "epoch": 0.6127659574468085, "step": 18 }, { "loss": 2.604454517364502, "grad_norm": 0.427734375, "learning_rate": 3.6e-05, "entropy": 2.4969288408756256, "num_tokens": 94386.0, "mean_token_accuracy": 0.477350901812315, "epoch": 0.6468085106382979, "step": 19 }, { "loss": 2.621829032897949, "grad_norm": 0.408203125, "learning_rate": 3.8e-05, "entropy": 2.481768637895584, "num_tokens": 99467.0, "mean_token_accuracy": 0.48110294714570045, "epoch": 0.6808510638297872, "step": 20 }, { "loss": 2.6175484657287598, "grad_norm": 0.41015625, "learning_rate": 4e-05, "entropy": 2.487631529569626, "num_tokens": 104449.0, "mean_token_accuracy": 0.4800560437142849, "epoch": 0.7148936170212766, "step": 21 }, { "loss": 2.672231674194336, "grad_norm": 0.404296875, "learning_rate": 4.2e-05, "entropy": 2.5790265798568726, "num_tokens": 109700.0, "mean_token_accuracy": 0.48193371295928955, "epoch": 0.7489361702127659, "step": 22 }, { "loss": 2.6545979976654053, "grad_norm": 0.39453125, "learning_rate": 4.4000000000000006e-05, "entropy": 2.531944215297699, "num_tokens": 114714.0, "mean_token_accuracy": 0.4792437721043825, "epoch": 0.7829787234042553, "step": 23 }, { "loss": 2.6065099239349365, "grad_norm": 0.40234375, "learning_rate": 4.600000000000001e-05, "entropy": 2.5392896085977554, "num_tokens": 119436.0, "mean_token_accuracy": 0.48113011196255684, "epoch": 0.8170212765957446, "step": 24 }, { "loss": 2.590505599975586, "grad_norm": 0.3828125, "learning_rate": 4.8e-05, "entropy": 2.5140435248613358, "num_tokens": 124375.0, "mean_token_accuracy": 0.4759677853435278, "epoch": 0.851063829787234, "step": 25 }, { "loss": 2.6808810234069824, "grad_norm": 0.408203125, "learning_rate": 5e-05, "entropy": 2.6100679636001587, "num_tokens": 128978.0, "mean_token_accuracy": 0.4715201146900654, "epoch": 0.8851063829787233, "step": 26 }, { "loss": 2.66715145111084, "grad_norm": 0.388671875, "learning_rate": 5.2000000000000004e-05, "entropy": 2.552435740828514, "num_tokens": 133774.0, "mean_token_accuracy": 0.47219102270901203, "epoch": 0.9191489361702128, "step": 27 }, { "loss": 2.709463119506836, "grad_norm": 0.388671875, "learning_rate": 5.4000000000000005e-05, "entropy": 2.5946313589811325, "num_tokens": 138692.0, "mean_token_accuracy": 0.46634298376739025, "epoch": 0.9531914893617022, "step": 28 }, { "loss": 2.6033380031585693, "grad_norm": 0.375, "learning_rate": 5.6000000000000006e-05, "entropy": 2.526190608739853, "num_tokens": 143830.0, "mean_token_accuracy": 0.4989198762923479, "epoch": 0.9872340425531915, "step": 29 }, { "loss": 2.5490007400512695, "grad_norm": 0.369140625, "learning_rate": 5.8e-05, "entropy": 2.4985697666803994, "num_tokens": 145794.0, "mean_token_accuracy": 0.4962264746427536, "epoch": 1.0, "step": 30 }, { "loss": 2.521101474761963, "grad_norm": 0.365234375, "learning_rate": 6e-05, "entropy": 2.4535591900348663, "num_tokens": 150798.0, "mean_token_accuracy": 0.49643513932824135, "epoch": 1.0340425531914894, "step": 31 }, { "loss": 2.554738759994507, "grad_norm": 0.3671875, "learning_rate": 6.2e-05, "entropy": 2.45968796312809, "num_tokens": 155646.0, "mean_token_accuracy": 0.4902253895998001, "epoch": 1.0680851063829788, "step": 32 }, { "loss": 2.5156683921813965, "grad_norm": 0.36328125, "learning_rate": 6.400000000000001e-05, "entropy": 2.461159363389015, "num_tokens": 160562.0, "mean_token_accuracy": 0.5049086343497038, "epoch": 1.102127659574468, "step": 33 }, { "loss": 2.5962746143341064, "grad_norm": 0.365234375, "learning_rate": 6.6e-05, "entropy": 2.5223591923713684, "num_tokens": 165453.0, "mean_token_accuracy": 0.48128395341336727, "epoch": 1.1361702127659574, "step": 34 }, { "loss": 2.595503568649292, "grad_norm": 0.35546875, "learning_rate": 6.800000000000001e-05, "entropy": 2.5031087547540665, "num_tokens": 170435.0, "mean_token_accuracy": 0.4835202544927597, "epoch": 1.1702127659574468, "step": 35 }, { "loss": 2.5437679290771484, "grad_norm": 0.341796875, "learning_rate": 7e-05, "entropy": 2.4591288417577744, "num_tokens": 175573.0, "mean_token_accuracy": 0.49856754019856453, "epoch": 1.2042553191489362, "step": 36 }, { "loss": 2.5545172691345215, "grad_norm": 0.333984375, "learning_rate": 7.2e-05, "entropy": 2.4627918899059296, "num_tokens": 180815.0, "mean_token_accuracy": 0.5012467484921217, "epoch": 1.2382978723404254, "step": 37 }, { "loss": 2.620225667953491, "grad_norm": 0.35546875, "learning_rate": 7.4e-05, "entropy": 2.536210775375366, "num_tokens": 185637.0, "mean_token_accuracy": 0.47896021977066994, "epoch": 1.2723404255319148, "step": 38 }, { "loss": 2.5529398918151855, "grad_norm": 0.357421875, "learning_rate": 7.6e-05, "entropy": 2.517217695713043, "num_tokens": 190481.0, "mean_token_accuracy": 0.4911583326756954, "epoch": 1.3063829787234043, "step": 39 }, { "loss": 2.575059413909912, "grad_norm": 0.3515625, "learning_rate": 7.800000000000001e-05, "entropy": 2.496584936976433, "num_tokens": 195446.0, "mean_token_accuracy": 0.4885687828063965, "epoch": 1.3404255319148937, "step": 40 }, { "loss": 2.5424654483795166, "grad_norm": 0.345703125, "learning_rate": 8e-05, "entropy": 2.4481908082962036, "num_tokens": 200608.0, "mean_token_accuracy": 0.49191189743578434, "epoch": 1.374468085106383, "step": 41 }, { "loss": 2.528459310531616, "grad_norm": 0.353515625, "learning_rate": 8.2e-05, "entropy": 2.486226588487625, "num_tokens": 205499.0, "mean_token_accuracy": 0.486592223867774, "epoch": 1.4085106382978723, "step": 42 }, { "loss": 2.4288268089294434, "grad_norm": 0.34765625, "learning_rate": 8.4e-05, "entropy": 2.4298370331525803, "num_tokens": 210526.0, "mean_token_accuracy": 0.5108798053115606, "epoch": 1.4425531914893617, "step": 43 }, { "loss": 2.451564073562622, "grad_norm": 0.357421875, "learning_rate": 8.6e-05, "entropy": 2.4269234240055084, "num_tokens": 215262.0, "mean_token_accuracy": 0.49717940017580986, "epoch": 1.476595744680851, "step": 44 }, { "loss": 2.5790700912475586, "grad_norm": 0.33984375, "learning_rate": 8.800000000000001e-05, "entropy": 2.5420146584510803, "num_tokens": 220397.0, "mean_token_accuracy": 0.48302956856787205, "epoch": 1.5106382978723403, "step": 45 }, { "loss": 2.5122318267822266, "grad_norm": 0.349609375, "learning_rate": 9e-05, "entropy": 2.4702245742082596, "num_tokens": 225223.0, "mean_token_accuracy": 0.4944263659417629, "epoch": 1.5446808510638297, "step": 46 }, { "loss": 2.467524528503418, "grad_norm": 0.35546875, "learning_rate": 9.200000000000001e-05, "entropy": 2.422152951359749, "num_tokens": 230038.0, "mean_token_accuracy": 0.4996390175074339, "epoch": 1.578723404255319, "step": 47 }, { "loss": 2.50884747505188, "grad_norm": 0.353515625, "learning_rate": 9.4e-05, "entropy": 2.48351514339447, "num_tokens": 234994.0, "mean_token_accuracy": 0.501384649425745, "epoch": 1.6127659574468085, "step": 48 }, { "loss": 2.500056266784668, "grad_norm": 0.345703125, "learning_rate": 9.6e-05, "entropy": 2.49152572453022, "num_tokens": 240081.0, "mean_token_accuracy": 0.5048438254743814, "epoch": 1.646808510638298, "step": 49 }, { "loss": 2.463620662689209, "grad_norm": 0.361328125, "learning_rate": 9.8e-05, "entropy": 2.412762477993965, "num_tokens": 244936.0, "mean_token_accuracy": 0.5132296048104763, "epoch": 1.6808510638297873, "step": 50 }, { "loss": 2.4155702590942383, "grad_norm": 0.34765625, "learning_rate": 0.0001, "entropy": 2.4134978353977203, "num_tokens": 249787.0, "mean_token_accuracy": 0.501561664044857, "epoch": 1.7148936170212767, "step": 51 }, { "loss": 2.4698283672332764, "grad_norm": 0.37109375, "learning_rate": 0.00010200000000000001, "entropy": 2.4626191705465317, "num_tokens": 254549.0, "mean_token_accuracy": 0.505357114598155, "epoch": 1.748936170212766, "step": 52 }, { "loss": 2.423790454864502, "grad_norm": 0.357421875, "learning_rate": 0.00010400000000000001, "entropy": 2.445830598473549, "num_tokens": 259281.0, "mean_token_accuracy": 0.5043601524084806, "epoch": 1.7829787234042553, "step": 53 }, { "loss": 2.4045891761779785, "grad_norm": 0.349609375, "learning_rate": 0.00010600000000000002, "entropy": 2.3872187584638596, "num_tokens": 264219.0, "mean_token_accuracy": 0.5117221903055906, "epoch": 1.8170212765957445, "step": 54 }, { "loss": 2.3898727893829346, "grad_norm": 0.328125, "learning_rate": 0.00010800000000000001, "entropy": 2.365436241030693, "num_tokens": 269423.0, "mean_token_accuracy": 0.5195568073540926, "epoch": 1.851063829787234, "step": 55 }, { "loss": 2.3509507179260254, "grad_norm": 0.328125, "learning_rate": 0.00011000000000000002, "entropy": 2.320594906806946, "num_tokens": 274667.0, "mean_token_accuracy": 0.5205415543168783, "epoch": 1.8851063829787233, "step": 56 }, { "loss": 2.4429311752319336, "grad_norm": 0.34375, "learning_rate": 0.00011200000000000001, "entropy": 2.3983265459537506, "num_tokens": 279633.0, "mean_token_accuracy": 0.5006663762032986, "epoch": 1.9191489361702128, "step": 57 }, { "loss": 2.378767967224121, "grad_norm": 0.333984375, "learning_rate": 0.00011399999999999999, "entropy": 2.3649341613054276, "num_tokens": 284855.0, "mean_token_accuracy": 0.5167245697230101, "epoch": 1.9531914893617022, "step": 58 }, { "loss": 2.2887539863586426, "grad_norm": 0.361328125, "learning_rate": 0.000116, "entropy": 2.3019338697195053, "num_tokens": 289663.0, "mean_token_accuracy": 0.5189696066081524, "epoch": 1.9872340425531916, "step": 59 }, { "loss": 2.3834011554718018, "grad_norm": 0.365234375, "learning_rate": 0.000118, "entropy": 2.393437663714091, "num_tokens": 291588.0, "mean_token_accuracy": 0.5125430872042974, "epoch": 2.0, "step": 60 }, { "loss": 2.250617027282715, "grad_norm": 0.3671875, "learning_rate": 0.00012, "entropy": 2.3306117355823517, "num_tokens": 296379.0, "mean_token_accuracy": 0.5301523543894291, "epoch": 2.0340425531914894, "step": 61 }, { "loss": 2.2675721645355225, "grad_norm": 0.361328125, "learning_rate": 0.000122, "entropy": 2.297535687685013, "num_tokens": 301335.0, "mean_token_accuracy": 0.5237707123160362, "epoch": 2.068085106382979, "step": 62 }, { "loss": 2.3007051944732666, "grad_norm": 0.34765625, "learning_rate": 0.000124, "entropy": 2.2993532568216324, "num_tokens": 306417.0, "mean_token_accuracy": 0.5290647000074387, "epoch": 2.1021276595744682, "step": 63 }, { "loss": 2.2428736686706543, "grad_norm": 0.34375, "learning_rate": 0.000126, "entropy": 2.275005057454109, "num_tokens": 311526.0, "mean_token_accuracy": 0.5337071865797043, "epoch": 2.1361702127659576, "step": 64 }, { "loss": 2.2673416137695312, "grad_norm": 0.37109375, "learning_rate": 0.00012800000000000002, "entropy": 2.3054451048374176, "num_tokens": 316376.0, "mean_token_accuracy": 0.5343158934265375, "epoch": 2.1702127659574466, "step": 65 }, { "loss": 2.1925814151763916, "grad_norm": 0.390625, "learning_rate": 0.00013000000000000002, "entropy": 2.2927923053503036, "num_tokens": 320981.0, "mean_token_accuracy": 0.5354686882346869, "epoch": 2.204255319148936, "step": 66 }, { "loss": 2.2779433727264404, "grad_norm": 0.36328125, "learning_rate": 0.000132, "entropy": 2.2592416927218437, "num_tokens": 326017.0, "mean_token_accuracy": 0.5330233704298735, "epoch": 2.2382978723404254, "step": 67 }, { "loss": 2.2277019023895264, "grad_norm": 0.376953125, "learning_rate": 0.000134, "entropy": 2.2629543989896774, "num_tokens": 330918.0, "mean_token_accuracy": 0.5323564410209656, "epoch": 2.272340425531915, "step": 68 }, { "loss": 2.187464952468872, "grad_norm": 0.376953125, "learning_rate": 0.00013600000000000003, "entropy": 2.2577197328209877, "num_tokens": 335768.0, "mean_token_accuracy": 0.5438785757869482, "epoch": 2.3063829787234043, "step": 69 }, { "loss": 2.136016368865967, "grad_norm": 0.3671875, "learning_rate": 0.000138, "entropy": 2.2293308824300766, "num_tokens": 340781.0, "mean_token_accuracy": 0.5530770476907492, "epoch": 2.3404255319148937, "step": 70 }, { "loss": 2.1845152378082275, "grad_norm": 0.353515625, "learning_rate": 0.00014, "entropy": 2.2304803282022476, "num_tokens": 345933.0, "mean_token_accuracy": 0.5472815595567226, "epoch": 2.374468085106383, "step": 71 }, { "loss": 2.1472551822662354, "grad_norm": 0.388671875, "learning_rate": 0.000142, "entropy": 2.2028725370764732, "num_tokens": 350686.0, "mean_token_accuracy": 0.5550102591514587, "epoch": 2.4085106382978725, "step": 72 }, { "loss": 2.059596538543701, "grad_norm": 0.3671875, "learning_rate": 0.000144, "entropy": 2.132790096104145, "num_tokens": 355527.0, "mean_token_accuracy": 0.5591119825839996, "epoch": 2.4425531914893615, "step": 73 }, { "loss": 2.0898165702819824, "grad_norm": 0.35546875, "learning_rate": 0.000146, "entropy": 2.168770805001259, "num_tokens": 360727.0, "mean_token_accuracy": 0.564784336835146, "epoch": 2.476595744680851, "step": 74 }, { "loss": 2.0821704864501953, "grad_norm": 0.365234375, "learning_rate": 0.000148, "entropy": 2.169573627412319, "num_tokens": 365989.0, "mean_token_accuracy": 0.5744843184947968, "epoch": 2.5106382978723403, "step": 75 }, { "loss": 2.0777361392974854, "grad_norm": 0.3828125, "learning_rate": 0.00015000000000000001, "entropy": 2.155258484184742, "num_tokens": 370843.0, "mean_token_accuracy": 0.571710079908371, "epoch": 2.5446808510638297, "step": 76 }, { "loss": 2.1683201789855957, "grad_norm": 0.3671875, "learning_rate": 0.000152, "entropy": 2.1525698825716972, "num_tokens": 376021.0, "mean_token_accuracy": 0.5686175748705864, "epoch": 2.578723404255319, "step": 77 }, { "loss": 1.9324456453323364, "grad_norm": 0.37890625, "learning_rate": 0.000154, "entropy": 2.0096444189548492, "num_tokens": 380765.0, "mean_token_accuracy": 0.598829809576273, "epoch": 2.6127659574468085, "step": 78 }, { "loss": 2.1014904975891113, "grad_norm": 0.41015625, "learning_rate": 0.00015600000000000002, "entropy": 2.168574407696724, "num_tokens": 385974.0, "mean_token_accuracy": 0.5878412388265133, "epoch": 2.646808510638298, "step": 79 }, { "loss": 2.0153093338012695, "grad_norm": 0.390625, "learning_rate": 0.00015800000000000002, "entropy": 2.028884157538414, "num_tokens": 391037.0, "mean_token_accuracy": 0.5932744853198528, "epoch": 2.6808510638297873, "step": 80 }, { "loss": 1.9153048992156982, "grad_norm": 0.396484375, "learning_rate": 0.00016, "entropy": 2.0838937163352966, "num_tokens": 395831.0, "mean_token_accuracy": 0.6091429851949215, "epoch": 2.7148936170212767, "step": 81 }, { "loss": 1.9109594821929932, "grad_norm": 0.412109375, "learning_rate": 0.000162, "entropy": 2.022364303469658, "num_tokens": 400672.0, "mean_token_accuracy": 0.6093469746410847, "epoch": 2.748936170212766, "step": 82 }, { "loss": 1.9070355892181396, "grad_norm": 0.3984375, "learning_rate": 0.000164, "entropy": 1.988129936158657, "num_tokens": 405474.0, "mean_token_accuracy": 0.6030600965023041, "epoch": 2.7829787234042556, "step": 83 }, { "loss": 1.8688206672668457, "grad_norm": 0.375, "learning_rate": 0.000166, "entropy": 1.9407341703772545, "num_tokens": 410563.0, "mean_token_accuracy": 0.6143141649663448, "epoch": 2.8170212765957445, "step": 84 }, { "loss": 1.8686532974243164, "grad_norm": 0.38671875, "learning_rate": 0.000168, "entropy": 1.9878689795732498, "num_tokens": 415701.0, "mean_token_accuracy": 0.6248090676963329, "epoch": 2.851063829787234, "step": 85 }, { "loss": 1.8615574836730957, "grad_norm": 0.392578125, "learning_rate": 0.00017, "entropy": 1.915215753018856, "num_tokens": 420683.0, "mean_token_accuracy": 0.6202812306582928, "epoch": 2.8851063829787233, "step": 86 }, { "loss": 1.8605434894561768, "grad_norm": 0.390625, "learning_rate": 0.000172, "entropy": 1.946378968656063, "num_tokens": 425957.0, "mean_token_accuracy": 0.6251695863902569, "epoch": 2.9191489361702128, "step": 87 }, { "loss": 1.7858905792236328, "grad_norm": 0.427734375, "learning_rate": 0.000174, "entropy": 1.8966291397809982, "num_tokens": 430663.0, "mean_token_accuracy": 0.640456885099411, "epoch": 2.953191489361702, "step": 88 }, { "loss": 1.734214186668396, "grad_norm": 0.435546875, "learning_rate": 0.00017600000000000002, "entropy": 1.8518416732549667, "num_tokens": 435395.0, "mean_token_accuracy": 0.6417186073958874, "epoch": 2.9872340425531916, "step": 89 }, { "loss": 1.8643648624420166, "grad_norm": 0.470703125, "learning_rate": 0.00017800000000000002, "entropy": 1.9248290061950684, "num_tokens": 437382.0, "mean_token_accuracy": 0.629046360651652, "epoch": 3.0, "step": 90 }, { "loss": 1.7444175481796265, "grad_norm": 0.447265625, "learning_rate": 0.00018, "entropy": 1.8195274099707603, "num_tokens": 442232.0, "mean_token_accuracy": 0.6317118927836418, "epoch": 3.0340425531914894, "step": 91 }, { "loss": 1.6900278329849243, "grad_norm": 0.43359375, "learning_rate": 0.000182, "entropy": 1.8135322630405426, "num_tokens": 447045.0, "mean_token_accuracy": 0.6387943811714649, "epoch": 3.068085106382979, "step": 92 }, { "loss": 1.6699750423431396, "grad_norm": 0.419921875, "learning_rate": 0.00018400000000000003, "entropy": 1.7506053671240807, "num_tokens": 452263.0, "mean_token_accuracy": 0.6522074639797211, "epoch": 3.1021276595744682, "step": 93 }, { "loss": 1.6945375204086304, "grad_norm": 0.44140625, "learning_rate": 0.00018600000000000002, "entropy": 1.7732647880911827, "num_tokens": 457063.0, "mean_token_accuracy": 0.640327800065279, "epoch": 3.1361702127659576, "step": 94 }, { "loss": 1.6494274139404297, "grad_norm": 0.447265625, "learning_rate": 0.000188, "entropy": 1.752028465270996, "num_tokens": 462035.0, "mean_token_accuracy": 0.6459747105836868, "epoch": 3.1702127659574466, "step": 95 }, { "loss": 1.6096299886703491, "grad_norm": 0.41796875, "learning_rate": 0.00019, "entropy": 1.695874534547329, "num_tokens": 467063.0, "mean_token_accuracy": 0.6539911031723022, "epoch": 3.204255319148936, "step": 96 }, { "loss": 1.5980371236801147, "grad_norm": 0.4375, "learning_rate": 0.000192, "entropy": 1.686888948082924, "num_tokens": 472049.0, "mean_token_accuracy": 0.6620035134255886, "epoch": 3.2382978723404254, "step": 97 }, { "loss": 1.7168352603912354, "grad_norm": 0.453125, "learning_rate": 0.000194, "entropy": 1.7836647480726242, "num_tokens": 477468.0, "mean_token_accuracy": 0.6486878879368305, "epoch": 3.272340425531915, "step": 98 }, { "loss": 1.4270864725112915, "grad_norm": 0.478515625, "learning_rate": 0.000196, "entropy": 1.618331864476204, "num_tokens": 481968.0, "mean_token_accuracy": 0.6884454749524593, "epoch": 3.3063829787234043, "step": 99 }, { "loss": 1.5164051055908203, "grad_norm": 0.44921875, "learning_rate": 0.00019800000000000002, "entropy": 1.6308004260063171, "num_tokens": 487055.0, "mean_token_accuracy": 0.6921050474047661, "epoch": 3.3404255319148937, "step": 100 }, { "loss": 1.5571681261062622, "grad_norm": 0.43359375, "learning_rate": 0.0002, "entropy": 1.7118290215730667, "num_tokens": 491932.0, "mean_token_accuracy": 0.6764726638793945, "epoch": 3.374468085106383, "step": 101 }, { "loss": 1.4636738300323486, "grad_norm": 0.4609375, "learning_rate": 0.00019980267284282717, "entropy": 1.524270586669445, "num_tokens": 496898.0, "mean_token_accuracy": 0.6990780048072338, "epoch": 3.4085106382978725, "step": 102 }, { "loss": 1.3923920392990112, "grad_norm": 0.4296875, "learning_rate": 0.0001992114701314478, "entropy": 1.5439333245158195, "num_tokens": 501626.0, "mean_token_accuracy": 0.7126500345766544, "epoch": 3.4425531914893615, "step": 103 }, { "loss": 1.4482853412628174, "grad_norm": 0.419921875, "learning_rate": 0.0001982287250728689, "entropy": 1.597928948700428, "num_tokens": 506533.0, "mean_token_accuracy": 0.6986983120441437, "epoch": 3.476595744680851, "step": 104 }, { "loss": 1.3720523118972778, "grad_norm": 0.37890625, "learning_rate": 0.0001968583161128631, "entropy": 1.5071553364396095, "num_tokens": 511455.0, "mean_token_accuracy": 0.7070708498358727, "epoch": 3.5106382978723403, "step": 105 }, { "loss": 1.5559459924697876, "grad_norm": 0.373046875, "learning_rate": 0.00019510565162951537, "entropy": 1.60520950704813, "num_tokens": 516324.0, "mean_token_accuracy": 0.6896940842270851, "epoch": 3.5446808510638297, "step": 106 }, { "loss": 1.4047073125839233, "grad_norm": 0.33203125, "learning_rate": 0.00019297764858882514, "entropy": 1.4945818409323692, "num_tokens": 521284.0, "mean_token_accuracy": 0.7080324850976467, "epoch": 3.578723404255319, "step": 107 }, { "loss": 1.4714158773422241, "grad_norm": 0.373046875, "learning_rate": 0.00019048270524660196, "entropy": 1.5630423054099083, "num_tokens": 526297.0, "mean_token_accuracy": 0.6965114735066891, "epoch": 3.6127659574468085, "step": 108 }, { "loss": 1.4059462547302246, "grad_norm": 0.349609375, "learning_rate": 0.00018763066800438636, "entropy": 1.4461402520537376, "num_tokens": 531450.0, "mean_token_accuracy": 0.7164609096944332, "epoch": 3.646808510638298, "step": 109 }, { "loss": 1.2749652862548828, "grad_norm": 0.318359375, "learning_rate": 0.00018443279255020152, "entropy": 1.3870351910591125, "num_tokens": 536000.0, "mean_token_accuracy": 0.7280398681759834, "epoch": 3.6808510638297873, "step": 110 }, { "loss": 1.4837267398834229, "grad_norm": 0.337890625, "learning_rate": 0.00018090169943749476, "entropy": 1.5375063568353653, "num_tokens": 541121.0, "mean_token_accuracy": 0.6994874440133572, "epoch": 3.7148936170212767, "step": 111 }, { "loss": 1.3818199634552002, "grad_norm": 0.3046875, "learning_rate": 0.00017705132427757895, "entropy": 1.4185625314712524, "num_tokens": 546194.0, "mean_token_accuracy": 0.7121663726866245, "epoch": 3.748936170212766, "step": 112 }, { "loss": 1.484237790107727, "grad_norm": 0.33203125, "learning_rate": 0.00017289686274214118, "entropy": 1.4590958431363106, "num_tokens": 550992.0, "mean_token_accuracy": 0.7153463698923588, "epoch": 3.7829787234042556, "step": 113 }, { "loss": 1.4489842653274536, "grad_norm": 0.310546875, "learning_rate": 0.00016845471059286887, "entropy": 1.5098442435264587, "num_tokens": 556324.0, "mean_token_accuracy": 0.7081326432526112, "epoch": 3.8170212765957445, "step": 114 }, { "loss": 1.4387658834457397, "grad_norm": 0.30078125, "learning_rate": 0.000163742398974869, "entropy": 1.3906443491578102, "num_tokens": 561234.0, "mean_token_accuracy": 0.7070117965340614, "epoch": 3.851063829787234, "step": 115 }, { "loss": 1.3244398832321167, "grad_norm": 0.251953125, "learning_rate": 0.00015877852522924732, "entropy": 1.3757080286741257, "num_tokens": 566041.0, "mean_token_accuracy": 0.7217641994357109, "epoch": 3.8851063829787233, "step": 116 }, { "loss": 1.423140525817871, "grad_norm": 0.29296875, "learning_rate": 0.00015358267949789966, "entropy": 1.419171568006277, "num_tokens": 570943.0, "mean_token_accuracy": 0.7150863148272038, "epoch": 3.9191489361702128, "step": 117 }, { "loss": 1.3995354175567627, "grad_norm": 0.248046875, "learning_rate": 0.00014817536741017152, "entropy": 1.4624414294958115, "num_tokens": 576068.0, "mean_token_accuracy": 0.7091907598078251, "epoch": 3.953191489361702, "step": 118 }, { "loss": 1.4066781997680664, "grad_norm": 0.27734375, "learning_rate": 0.00014257792915650728, "entropy": 1.4442569240927696, "num_tokens": 581375.0, "mean_token_accuracy": 0.7081452198326588, "epoch": 3.9872340425531916, "step": 119 }, { "loss": 1.323001742362976, "grad_norm": 0.40625, "learning_rate": 0.00013681245526846783, "entropy": 1.3550565044085185, "num_tokens": 583176.0, "mean_token_accuracy": 0.7349392573038737, "epoch": 4.0, "step": 120 }, { "loss": 1.2741748094558716, "grad_norm": 0.2392578125, "learning_rate": 0.00013090169943749476, "entropy": 1.3441371396183968, "num_tokens": 588029.0, "mean_token_accuracy": 0.730826698243618, "epoch": 4.034042553191489, "step": 121 }, { "loss": 1.3337856531143188, "grad_norm": 0.248046875, "learning_rate": 0.0001248689887164855, "entropy": 1.3350358568131924, "num_tokens": 593015.0, "mean_token_accuracy": 0.7424857430160046, "epoch": 4.068085106382979, "step": 122 }, { "loss": 1.3573651313781738, "grad_norm": 0.23828125, "learning_rate": 0.00011873813145857249, "entropy": 1.3992167189717293, "num_tokens": 598103.0, "mean_token_accuracy": 0.7057315893471241, "epoch": 4.102127659574468, "step": 123 }, { "loss": 1.3937177658081055, "grad_norm": 0.23046875, "learning_rate": 0.00011253332335643043, "entropy": 1.4113677814602852, "num_tokens": 603184.0, "mean_token_accuracy": 0.7157314494252205, "epoch": 4.136170212765958, "step": 124 }, { "loss": 1.3261609077453613, "grad_norm": 0.2421875, "learning_rate": 0.00010627905195293135, "entropy": 1.329230971634388, "num_tokens": 608043.0, "mean_token_accuracy": 0.7272443808615208, "epoch": 4.170212765957447, "step": 125 }, { "loss": 1.3460662364959717, "grad_norm": 0.2294921875, "learning_rate": 0.0001, "entropy": 1.4104024805128574, "num_tokens": 613118.0, "mean_token_accuracy": 0.7190378718078136, "epoch": 4.2042553191489365, "step": 126 }, { "loss": 1.2607674598693848, "grad_norm": 0.2294921875, "learning_rate": 9.372094804706867e-05, "entropy": 1.2900474704802036, "num_tokens": 617796.0, "mean_token_accuracy": 0.7468780763447285, "epoch": 4.238297872340426, "step": 127 }, { "loss": 1.3044146299362183, "grad_norm": 0.21875, "learning_rate": 8.746667664356956e-05, "entropy": 1.3943994268774986, "num_tokens": 622463.0, "mean_token_accuracy": 0.7257406860589981, "epoch": 4.272340425531915, "step": 128 }, { "loss": 1.403165340423584, "grad_norm": 0.2216796875, "learning_rate": 8.126186854142752e-05, "entropy": 1.416014552116394, "num_tokens": 627604.0, "mean_token_accuracy": 0.7186664901673794, "epoch": 4.306382978723404, "step": 129 }, { "loss": 1.3515689373016357, "grad_norm": 0.21875, "learning_rate": 7.513101128351454e-05, "entropy": 1.4118612408638, "num_tokens": 632472.0, "mean_token_accuracy": 0.7061808668076992, "epoch": 4.340425531914893, "step": 130 }, { "loss": 1.3877007961273193, "grad_norm": 0.2216796875, "learning_rate": 6.909830056250527e-05, "entropy": 1.442810334265232, "num_tokens": 637627.0, "mean_token_accuracy": 0.7081518247723579, "epoch": 4.374468085106383, "step": 131 }, { "loss": 1.3801257610321045, "grad_norm": 0.2353515625, "learning_rate": 6.318754473153221e-05, "entropy": 1.3938085660338402, "num_tokens": 642749.0, "mean_token_accuracy": 0.711228184401989, "epoch": 4.408510638297872, "step": 132 }, { "loss": 1.3554068803787231, "grad_norm": 0.2197265625, "learning_rate": 5.7422070843492734e-05, "entropy": 1.3671610057353973, "num_tokens": 647738.0, "mean_token_accuracy": 0.7216231897473335, "epoch": 4.4425531914893615, "step": 133 }, { "loss": 1.3064651489257812, "grad_norm": 0.2177734375, "learning_rate": 5.182463258982846e-05, "entropy": 1.373083382844925, "num_tokens": 652607.0, "mean_token_accuracy": 0.7263847254216671, "epoch": 4.476595744680851, "step": 134 }, { "loss": 1.3579915761947632, "grad_norm": 0.2080078125, "learning_rate": 4.6417320502100316e-05, "entropy": 1.3386466540396214, "num_tokens": 657442.0, "mean_token_accuracy": 0.7228063195943832, "epoch": 4.51063829787234, "step": 135 }, { "loss": 1.2104823589324951, "grad_norm": 0.2041015625, "learning_rate": 4.12214747707527e-05, "entropy": 1.2631035968661308, "num_tokens": 662309.0, "mean_token_accuracy": 0.7428977265954018, "epoch": 4.54468085106383, "step": 136 }, { "loss": 1.4721013307571411, "grad_norm": 0.2392578125, "learning_rate": 3.6257601025131026e-05, "entropy": 1.4593609496951103, "num_tokens": 667541.0, "mean_token_accuracy": 0.7006841897964478, "epoch": 4.578723404255319, "step": 137 }, { "loss": 1.3205115795135498, "grad_norm": 0.1904296875, "learning_rate": 3.154528940713113e-05, "entropy": 1.3411865159869194, "num_tokens": 672435.0, "mean_token_accuracy": 0.711910966783762, "epoch": 4.6127659574468085, "step": 138 }, { "loss": 1.3404732942581177, "grad_norm": 0.205078125, "learning_rate": 2.7103137257858868e-05, "entropy": 1.35780980437994, "num_tokens": 677587.0, "mean_token_accuracy": 0.7195054478943348, "epoch": 4.646808510638298, "step": 139 }, { "loss": 1.3404691219329834, "grad_norm": 0.2109375, "learning_rate": 2.2948675722421086e-05, "entropy": 1.385502077639103, "num_tokens": 682608.0, "mean_token_accuracy": 0.7179553732275963, "epoch": 4.680851063829787, "step": 140 }, { "loss": 1.298667073249817, "grad_norm": 0.2119140625, "learning_rate": 1.9098300562505266e-05, "entropy": 1.3394257836043835, "num_tokens": 687559.0, "mean_token_accuracy": 0.7157100811600685, "epoch": 4.714893617021277, "step": 141 }, { "loss": 1.357086181640625, "grad_norm": 0.216796875, "learning_rate": 1.5567207449798515e-05, "entropy": 1.392254188656807, "num_tokens": 692932.0, "mean_token_accuracy": 0.7176991924643517, "epoch": 4.748936170212766, "step": 142 }, { "loss": 1.1888355016708374, "grad_norm": 0.2021484375, "learning_rate": 1.2369331995613665e-05, "entropy": 1.2131473422050476, "num_tokens": 697459.0, "mean_token_accuracy": 0.7450261078774929, "epoch": 4.782978723404256, "step": 143 }, { "loss": 1.3256398439407349, "grad_norm": 0.20703125, "learning_rate": 9.517294753398064e-06, "entropy": 1.3851491175591946, "num_tokens": 702509.0, "mean_token_accuracy": 0.7178600430488586, "epoch": 4.817021276595745, "step": 144 }, { "loss": 1.314615249633789, "grad_norm": 0.205078125, "learning_rate": 7.022351411174866e-06, "entropy": 1.3774543926119804, "num_tokens": 707467.0, "mean_token_accuracy": 0.7143742069602013, "epoch": 4.851063829787234, "step": 145 }, { "loss": 1.2613680362701416, "grad_norm": 0.1982421875, "learning_rate": 4.8943483704846475e-06, "entropy": 1.2662791721522808, "num_tokens": 712309.0, "mean_token_accuracy": 0.7293516807258129, "epoch": 4.885106382978723, "step": 146 }, { "loss": 1.3638486862182617, "grad_norm": 0.2060546875, "learning_rate": 3.1416838871368924e-06, "entropy": 1.3923396654427052, "num_tokens": 717370.0, "mean_token_accuracy": 0.7183186002075672, "epoch": 4.919148936170213, "step": 147 }, { "loss": 1.3947694301605225, "grad_norm": 0.2138671875, "learning_rate": 1.771274927131139e-06, "entropy": 1.4238540306687355, "num_tokens": 722391.0, "mean_token_accuracy": 0.7114552594721317, "epoch": 4.953191489361702, "step": 148 }, { "loss": 1.3086700439453125, "grad_norm": 0.2109375, "learning_rate": 7.885298685522235e-07, "entropy": 1.3416940197348595, "num_tokens": 727232.0, "mean_token_accuracy": 0.7375914193689823, "epoch": 4.987234042553191, "step": 149 }, { "loss": 1.3020784854888916, "grad_norm": 0.333984375, "learning_rate": 1.973271571728441e-07, "entropy": 1.2878982325394948, "num_tokens": 728970.0, "mean_token_accuracy": 0.7272012829780579, "epoch": 5.0, "step": 150 }, { "train_runtime": 511.1784, "train_samples_per_second": 4.597, "train_steps_per_second": 0.293, "total_flos": 468545712433920.0, "train_loss": 2.0039055053393047, "epoch": 5.0, "step": 150 } ]