{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.1451612903225805, "eval_steps": 30, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008064516129032258, "grad_norm": NaN, "learning_rate": 0.0, "loss": 3.1612095832824707, "num_input_tokens_seen": 3376, "step": 1, "train_runtime": 54.6226, "train_tokens_per_second": 61.806 }, { "epoch": 0.016129032258064516, "grad_norm": 19.30766487121582, "learning_rate": 0.0, "loss": 3.09491229057312, "num_input_tokens_seen": 6750, "step": 2, "train_runtime": 59.7624, "train_tokens_per_second": 112.947 }, { "epoch": 0.024193548387096774, "grad_norm": 20.06780242919922, "learning_rate": 4e-05, "loss": 3.0951414108276367, "num_input_tokens_seen": 10132, "step": 3, "train_runtime": 63.3739, "train_tokens_per_second": 159.877 }, { "epoch": 0.03225806451612903, "grad_norm": 9.574868202209473, "learning_rate": 8e-05, "loss": 2.2402310371398926, "num_input_tokens_seen": 13496, "step": 4, "train_runtime": 67.0053, "train_tokens_per_second": 201.417 }, { "epoch": 0.04032258064516129, "grad_norm": 4.498556613922119, "learning_rate": 0.00012, "loss": 1.8392776250839233, "num_input_tokens_seen": 16978, "step": 5, "train_runtime": 70.7762, "train_tokens_per_second": 239.883 }, { "epoch": 0.04838709677419355, "grad_norm": 2.6655759811401367, "learning_rate": 0.00016, "loss": 1.4742008447647095, "num_input_tokens_seen": 20170, "step": 6, "train_runtime": 74.2843, "train_tokens_per_second": 271.524 }, { "epoch": 0.056451612903225805, "grad_norm": 2.0664868354797363, "learning_rate": 0.0002, "loss": 1.2891130447387695, "num_input_tokens_seen": 23566, "step": 7, "train_runtime": 77.9869, "train_tokens_per_second": 302.179 }, { "epoch": 0.06451612903225806, "grad_norm": 1.9103718996047974, "learning_rate": 0.00019999967645432384, "loss": 1.1510200500488281, "num_input_tokens_seen": 26930, "step": 8, "train_runtime": 81.6857, "train_tokens_per_second": 329.678 }, { "epoch": 0.07258064516129033, "grad_norm": 2.13531494140625, "learning_rate": 0.00019999870581938894, "loss": 1.1280962228775024, "num_input_tokens_seen": 30108, "step": 9, "train_runtime": 85.212, "train_tokens_per_second": 353.33 }, { "epoch": 0.08064516129032258, "grad_norm": 1.1911511421203613, "learning_rate": 0.0001999970881014762, "loss": 1.0238806009292603, "num_input_tokens_seen": 33254, "step": 10, "train_runtime": 88.7519, "train_tokens_per_second": 374.685 }, { "epoch": 0.08870967741935484, "grad_norm": 1.7620989084243774, "learning_rate": 0.00019999482331105377, "loss": 1.0168827772140503, "num_input_tokens_seen": 36458, "step": 11, "train_runtime": 92.3516, "train_tokens_per_second": 394.774 }, { "epoch": 0.0967741935483871, "grad_norm": 1.1830689907073975, "learning_rate": 0.0001999919114627769, "loss": 1.0140981674194336, "num_input_tokens_seen": 39848, "step": 12, "train_runtime": 96.1612, "train_tokens_per_second": 414.387 }, { "epoch": 0.10483870967741936, "grad_norm": 1.0954996347427368, "learning_rate": 0.00019998835257548786, "loss": 0.9716602563858032, "num_input_tokens_seen": 42964, "step": 13, "train_runtime": 99.7433, "train_tokens_per_second": 430.746 }, { "epoch": 0.11290322580645161, "grad_norm": 0.932753324508667, "learning_rate": 0.00019998414667221596, "loss": 0.9001954793930054, "num_input_tokens_seen": 46336, "step": 14, "train_runtime": 103.6049, "train_tokens_per_second": 447.237 }, { "epoch": 0.12096774193548387, "grad_norm": 0.8278332352638245, "learning_rate": 0.00019997929378017725, "loss": 0.9225928783416748, "num_input_tokens_seen": 49524, "step": 15, "train_runtime": 107.2795, "train_tokens_per_second": 461.635 }, { "epoch": 0.12903225806451613, "grad_norm": 0.7354539036750793, "learning_rate": 0.00019997379393077428, "loss": 0.8711258172988892, "num_input_tokens_seen": 52878, "step": 16, "train_runtime": 111.1447, "train_tokens_per_second": 475.758 }, { "epoch": 0.13709677419354838, "grad_norm": 0.7318344712257385, "learning_rate": 0.00019996764715959618, "loss": 0.799241840839386, "num_input_tokens_seen": 56342, "step": 17, "train_runtime": 115.127, "train_tokens_per_second": 489.39 }, { "epoch": 0.14516129032258066, "grad_norm": 0.774062991142273, "learning_rate": 0.0001999608535064182, "loss": 0.829187273979187, "num_input_tokens_seen": 59734, "step": 18, "train_runtime": 119.1867, "train_tokens_per_second": 501.18 }, { "epoch": 0.1532258064516129, "grad_norm": 0.8591275215148926, "learning_rate": 0.0001999534130152014, "loss": 0.8018933534622192, "num_input_tokens_seen": 63022, "step": 19, "train_runtime": 123.0444, "train_tokens_per_second": 512.189 }, { "epoch": 0.16129032258064516, "grad_norm": 0.9827930331230164, "learning_rate": 0.00019994532573409262, "loss": 0.8291414976119995, "num_input_tokens_seen": 66380, "step": 20, "train_runtime": 126.9621, "train_tokens_per_second": 522.833 }, { "epoch": 0.1693548387096774, "grad_norm": 0.9169955849647522, "learning_rate": 0.0001999365917154239, "loss": 0.8104863166809082, "num_input_tokens_seen": 69412, "step": 21, "train_runtime": 130.5539, "train_tokens_per_second": 531.673 }, { "epoch": 0.1774193548387097, "grad_norm": 0.8792040944099426, "learning_rate": 0.00019992721101571236, "loss": 0.8366917967796326, "num_input_tokens_seen": 72682, "step": 22, "train_runtime": 134.3459, "train_tokens_per_second": 541.007 }, { "epoch": 0.18548387096774194, "grad_norm": 0.8037822246551514, "learning_rate": 0.0001999171836956597, "loss": 0.8375218510627747, "num_input_tokens_seen": 75880, "step": 23, "train_runtime": 138.1243, "train_tokens_per_second": 549.36 }, { "epoch": 0.1935483870967742, "grad_norm": 0.6663984060287476, "learning_rate": 0.0001999065098201518, "loss": 0.7726024389266968, "num_input_tokens_seen": 79364, "step": 24, "train_runtime": 142.1732, "train_tokens_per_second": 558.22 }, { "epoch": 0.20161290322580644, "grad_norm": 0.6427092552185059, "learning_rate": 0.00019989518945825844, "loss": 0.7135709524154663, "num_input_tokens_seen": 82600, "step": 25, "train_runtime": 146.1185, "train_tokens_per_second": 565.294 }, { "epoch": 0.20967741935483872, "grad_norm": 0.7197744846343994, "learning_rate": 0.00019988322268323268, "loss": 0.7193007469177246, "num_input_tokens_seen": 85954, "step": 26, "train_runtime": 150.1399, "train_tokens_per_second": 572.493 }, { "epoch": 0.21774193548387097, "grad_norm": 0.6793457865715027, "learning_rate": 0.00019987060957251047, "loss": 0.7056538462638855, "num_input_tokens_seen": 89314, "step": 27, "train_runtime": 154.1923, "train_tokens_per_second": 579.238 }, { "epoch": 0.22580645161290322, "grad_norm": 0.7098727226257324, "learning_rate": 0.00019985735020771017, "loss": 0.7221028208732605, "num_input_tokens_seen": 92550, "step": 28, "train_runtime": 158.0575, "train_tokens_per_second": 585.546 }, { "epoch": 0.23387096774193547, "grad_norm": 0.656500518321991, "learning_rate": 0.00019984344467463197, "loss": 0.6559625864028931, "num_input_tokens_seen": 95904, "step": 29, "train_runtime": 162.0987, "train_tokens_per_second": 591.639 }, { "epoch": 0.24193548387096775, "grad_norm": 0.7039774060249329, "learning_rate": 0.0001998288930632574, "loss": 0.7168108224868774, "num_input_tokens_seen": 99314, "step": 30, "train_runtime": 166.2762, "train_tokens_per_second": 597.283 }, { "epoch": 0.24193548387096775, "eval_loss": 3.0846056938171387, "eval_runtime": 12.7508, "eval_samples_per_second": 4.078, "eval_steps_per_second": 2.039, "num_input_tokens_seen": 99314, "step": 30 }, { "epoch": 0.25, "grad_norm": 0.7343088984489441, "learning_rate": 0.00019981369546774865, "loss": 0.7140476107597351, "num_input_tokens_seen": 102632, "step": 31, "train_runtime": 188.002, "train_tokens_per_second": 545.909 }, { "epoch": 0.25806451612903225, "grad_norm": 0.8019713759422302, "learning_rate": 0.00019979785198644806, "loss": 0.7408339977264404, "num_input_tokens_seen": 106032, "step": 32, "train_runtime": 192.0827, "train_tokens_per_second": 552.012 }, { "epoch": 0.2661290322580645, "grad_norm": 0.728600025177002, "learning_rate": 0.00019978136272187747, "loss": 0.6582087278366089, "num_input_tokens_seen": 109374, "step": 33, "train_runtime": 196.1526, "train_tokens_per_second": 557.596 }, { "epoch": 0.27419354838709675, "grad_norm": 0.6822960376739502, "learning_rate": 0.0001997642277807374, "loss": 0.7177855372428894, "num_input_tokens_seen": 112780, "step": 34, "train_runtime": 200.2851, "train_tokens_per_second": 563.097 }, { "epoch": 0.28225806451612906, "grad_norm": 0.7499037384986877, "learning_rate": 0.00019974644727390665, "loss": 0.6647191047668457, "num_input_tokens_seen": 116154, "step": 35, "train_runtime": 204.4174, "train_tokens_per_second": 568.22 }, { "epoch": 0.2903225806451613, "grad_norm": 0.7178347110748291, "learning_rate": 0.00019972802131644127, "loss": 0.6842952370643616, "num_input_tokens_seen": 119350, "step": 36, "train_runtime": 208.3311, "train_tokens_per_second": 572.886 }, { "epoch": 0.29838709677419356, "grad_norm": 0.7969527244567871, "learning_rate": 0.00019970895002757413, "loss": 0.7260242700576782, "num_input_tokens_seen": 122566, "step": 37, "train_runtime": 212.2522, "train_tokens_per_second": 577.455 }, { "epoch": 0.3064516129032258, "grad_norm": 0.7407550811767578, "learning_rate": 0.00019968923353071377, "loss": 0.6808462738990784, "num_input_tokens_seen": 125906, "step": 38, "train_runtime": 216.3593, "train_tokens_per_second": 581.93 }, { "epoch": 0.31451612903225806, "grad_norm": 0.7005120515823364, "learning_rate": 0.00019966887195344403, "loss": 0.6249939799308777, "num_input_tokens_seen": 129178, "step": 39, "train_runtime": 220.3595, "train_tokens_per_second": 586.215 }, { "epoch": 0.3225806451612903, "grad_norm": 0.8344091773033142, "learning_rate": 0.0001996478654275229, "loss": 0.6812502145767212, "num_input_tokens_seen": 132242, "step": 40, "train_runtime": 224.1058, "train_tokens_per_second": 590.087 }, { "epoch": 0.33064516129032256, "grad_norm": 0.6059567332267761, "learning_rate": 0.00019962621408888177, "loss": 0.6382313966751099, "num_input_tokens_seen": 135598, "step": 41, "train_runtime": 228.1603, "train_tokens_per_second": 594.31 }, { "epoch": 0.3387096774193548, "grad_norm": 0.6195608377456665, "learning_rate": 0.00019960391807762463, "loss": 0.6314352750778198, "num_input_tokens_seen": 139036, "step": 42, "train_runtime": 232.2635, "train_tokens_per_second": 598.613 }, { "epoch": 0.3467741935483871, "grad_norm": 0.6370927095413208, "learning_rate": 0.00019958097753802693, "loss": 0.6387574672698975, "num_input_tokens_seen": 142314, "step": 43, "train_runtime": 236.252, "train_tokens_per_second": 602.382 }, { "epoch": 0.3548387096774194, "grad_norm": 0.6034757494926453, "learning_rate": 0.00019955739261853504, "loss": 0.6299672722816467, "num_input_tokens_seen": 145730, "step": 44, "train_runtime": 240.3213, "train_tokens_per_second": 606.396 }, { "epoch": 0.3629032258064516, "grad_norm": 0.6737009882926941, "learning_rate": 0.00019953316347176488, "loss": 0.5970463156700134, "num_input_tokens_seen": 148950, "step": 45, "train_runtime": 244.1671, "train_tokens_per_second": 610.033 }, { "epoch": 0.3709677419354839, "grad_norm": 0.6426769495010376, "learning_rate": 0.00019950829025450114, "loss": 0.647551417350769, "num_input_tokens_seen": 152208, "step": 46, "train_runtime": 248.1891, "train_tokens_per_second": 613.274 }, { "epoch": 0.3790322580645161, "grad_norm": 0.6058390736579895, "learning_rate": 0.0001994827731276963, "loss": 0.5269473791122437, "num_input_tokens_seen": 155494, "step": 47, "train_runtime": 252.1835, "train_tokens_per_second": 616.591 }, { "epoch": 0.3870967741935484, "grad_norm": 0.6453372836112976, "learning_rate": 0.00019945661225646946, "loss": 0.5612773895263672, "num_input_tokens_seen": 158822, "step": 48, "train_runtime": 256.2339, "train_tokens_per_second": 619.832 }, { "epoch": 0.3951612903225806, "grad_norm": 0.5935277342796326, "learning_rate": 0.0001994298078101054, "loss": 0.5867321491241455, "num_input_tokens_seen": 162234, "step": 49, "train_runtime": 260.2753, "train_tokens_per_second": 623.317 }, { "epoch": 0.4032258064516129, "grad_norm": 0.7631756663322449, "learning_rate": 0.00019940235996205333, "loss": 0.6878418922424316, "num_input_tokens_seen": 165674, "step": 50, "train_runtime": 264.4525, "train_tokens_per_second": 626.479 }, { "epoch": 0.4112903225806452, "grad_norm": 0.695699155330658, "learning_rate": 0.0001993742688899259, "loss": 0.5979565978050232, "num_input_tokens_seen": 169012, "step": 51, "train_runtime": 268.4933, "train_tokens_per_second": 629.483 }, { "epoch": 0.41935483870967744, "grad_norm": 0.7715777158737183, "learning_rate": 0.00019934553477549794, "loss": 0.596588134765625, "num_input_tokens_seen": 172396, "step": 52, "train_runtime": 272.5489, "train_tokens_per_second": 632.532 }, { "epoch": 0.4274193548387097, "grad_norm": 0.5751280784606934, "learning_rate": 0.00019931615780470558, "loss": 0.6130199432373047, "num_input_tokens_seen": 175766, "step": 53, "train_runtime": 276.6214, "train_tokens_per_second": 635.403 }, { "epoch": 0.43548387096774194, "grad_norm": 0.7077270150184631, "learning_rate": 0.00019928613816764458, "loss": 0.5735506415367126, "num_input_tokens_seen": 179034, "step": 54, "train_runtime": 280.6184, "train_tokens_per_second": 637.998 }, { "epoch": 0.4435483870967742, "grad_norm": 0.6344470381736755, "learning_rate": 0.00019925547605856934, "loss": 0.6677184700965881, "num_input_tokens_seen": 182380, "step": 55, "train_runtime": 284.6764, "train_tokens_per_second": 640.657 }, { "epoch": 0.45161290322580644, "grad_norm": 0.7267054915428162, "learning_rate": 0.00019922417167589183, "loss": 0.6288032531738281, "num_input_tokens_seen": 185574, "step": 56, "train_runtime": 288.5655, "train_tokens_per_second": 643.091 }, { "epoch": 0.4596774193548387, "grad_norm": 0.5684869885444641, "learning_rate": 0.00019919222522217996, "loss": 0.6335676908493042, "num_input_tokens_seen": 188970, "step": 57, "train_runtime": 292.6622, "train_tokens_per_second": 645.693 }, { "epoch": 0.46774193548387094, "grad_norm": 0.6018741726875305, "learning_rate": 0.00019915963690415647, "loss": 0.5417088270187378, "num_input_tokens_seen": 192282, "step": 58, "train_runtime": 296.681, "train_tokens_per_second": 648.11 }, { "epoch": 0.47580645161290325, "grad_norm": 0.6719103455543518, "learning_rate": 0.00019912640693269752, "loss": 0.5992182493209839, "num_input_tokens_seen": 195644, "step": 59, "train_runtime": 300.7533, "train_tokens_per_second": 650.513 }, { "epoch": 0.4838709677419355, "grad_norm": 0.657250165939331, "learning_rate": 0.00019909253552283143, "loss": 0.6172187328338623, "num_input_tokens_seen": 198956, "step": 60, "train_runtime": 304.8185, "train_tokens_per_second": 652.703 }, { "epoch": 0.4838709677419355, "eval_loss": 2.6003706455230713, "eval_runtime": 10.4131, "eval_samples_per_second": 4.994, "eval_steps_per_second": 2.497, "num_input_tokens_seen": 198956, "step": 60 }, { "epoch": 0.49193548387096775, "grad_norm": 0.6318468451499939, "learning_rate": 0.00019905802289373715, "loss": 0.5561873316764832, "num_input_tokens_seen": 202314, "step": 61, "train_runtime": 323.4182, "train_tokens_per_second": 625.549 }, { "epoch": 0.5, "grad_norm": 0.6109554171562195, "learning_rate": 0.0001990228692687429, "loss": 0.5585595369338989, "num_input_tokens_seen": 205722, "step": 62, "train_runtime": 327.4723, "train_tokens_per_second": 628.212 }, { "epoch": 0.5080645161290323, "grad_norm": 0.621971607208252, "learning_rate": 0.00019898707487532474, "loss": 0.6442801356315613, "num_input_tokens_seen": 208882, "step": 63, "train_runtime": 331.3395, "train_tokens_per_second": 630.417 }, { "epoch": 0.5161290322580645, "grad_norm": 0.5826683044433594, "learning_rate": 0.0001989506399451051, "loss": 0.575531542301178, "num_input_tokens_seen": 212198, "step": 64, "train_runtime": 335.3754, "train_tokens_per_second": 632.718 }, { "epoch": 0.5241935483870968, "grad_norm": 0.6495046615600586, "learning_rate": 0.0001989135647138513, "loss": 0.5925536751747131, "num_input_tokens_seen": 215586, "step": 65, "train_runtime": 339.5014, "train_tokens_per_second": 635.008 }, { "epoch": 0.532258064516129, "grad_norm": 0.7227226495742798, "learning_rate": 0.00019887584942147394, "loss": 0.6755293607711792, "num_input_tokens_seen": 218732, "step": 66, "train_runtime": 343.3709, "train_tokens_per_second": 637.014 }, { "epoch": 0.5403225806451613, "grad_norm": 0.6853323578834534, "learning_rate": 0.0001988374943120254, "loss": 0.6272655129432678, "num_input_tokens_seen": 221938, "step": 67, "train_runtime": 347.2586, "train_tokens_per_second": 639.114 }, { "epoch": 0.5483870967741935, "grad_norm": 0.5388026237487793, "learning_rate": 0.00019879849963369827, "loss": 0.6052734851837158, "num_input_tokens_seen": 225362, "step": 68, "train_runtime": 351.4187, "train_tokens_per_second": 641.292 }, { "epoch": 0.5564516129032258, "grad_norm": 0.5527206659317017, "learning_rate": 0.00019875886563882375, "loss": 0.5706397294998169, "num_input_tokens_seen": 228656, "step": 69, "train_runtime": 355.6332, "train_tokens_per_second": 642.955 }, { "epoch": 0.5645161290322581, "grad_norm": 0.5685147047042847, "learning_rate": 0.00019871859258387, "loss": 0.5797425508499146, "num_input_tokens_seen": 232052, "step": 70, "train_runtime": 359.7474, "train_tokens_per_second": 645.041 }, { "epoch": 0.5725806451612904, "grad_norm": 0.558627188205719, "learning_rate": 0.00019867768072944045, "loss": 0.5348191857337952, "num_input_tokens_seen": 235430, "step": 71, "train_runtime": 363.8094, "train_tokens_per_second": 647.125 }, { "epoch": 0.5806451612903226, "grad_norm": 0.6002105474472046, "learning_rate": 0.00019863613034027224, "loss": 0.5826228260993958, "num_input_tokens_seen": 238792, "step": 72, "train_runtime": 367.887, "train_tokens_per_second": 649.091 }, { "epoch": 0.5887096774193549, "grad_norm": 0.6451808214187622, "learning_rate": 0.0001985939416852343, "loss": 0.6138958930969238, "num_input_tokens_seen": 242136, "step": 73, "train_runtime": 371.9411, "train_tokens_per_second": 651.006 }, { "epoch": 0.5967741935483871, "grad_norm": 0.6479431986808777, "learning_rate": 0.00019855111503732574, "loss": 0.5971667170524597, "num_input_tokens_seen": 245502, "step": 74, "train_runtime": 375.9997, "train_tokens_per_second": 652.931 }, { "epoch": 0.6048387096774194, "grad_norm": 0.6030436754226685, "learning_rate": 0.00019850765067367412, "loss": 0.5875449776649475, "num_input_tokens_seen": 248724, "step": 75, "train_runtime": 379.8628, "train_tokens_per_second": 654.773 }, { "epoch": 0.6129032258064516, "grad_norm": 0.5927602648735046, "learning_rate": 0.00019846354887553358, "loss": 0.5731448531150818, "num_input_tokens_seen": 252068, "step": 76, "train_runtime": 383.878, "train_tokens_per_second": 656.636 }, { "epoch": 0.6209677419354839, "grad_norm": 0.6371795535087585, "learning_rate": 0.00019841880992828306, "loss": 0.5935379266738892, "num_input_tokens_seen": 255328, "step": 77, "train_runtime": 387.7359, "train_tokens_per_second": 658.51 }, { "epoch": 0.6290322580645161, "grad_norm": 0.5905424356460571, "learning_rate": 0.0001983734341214244, "loss": 0.5362335443496704, "num_input_tokens_seen": 258440, "step": 78, "train_runtime": 391.431, "train_tokens_per_second": 660.244 }, { "epoch": 0.6370967741935484, "grad_norm": 0.6883693337440491, "learning_rate": 0.00019832742174858052, "loss": 0.48908287286758423, "num_input_tokens_seen": 261670, "step": 79, "train_runtime": 395.3134, "train_tokens_per_second": 661.931 }, { "epoch": 0.6451612903225806, "grad_norm": 0.6175087094306946, "learning_rate": 0.0001982807731074935, "loss": 0.5709075331687927, "num_input_tokens_seen": 265042, "step": 80, "train_runtime": 399.3599, "train_tokens_per_second": 663.667 }, { "epoch": 0.6532258064516129, "grad_norm": 0.6919131875038147, "learning_rate": 0.00019823348850002268, "loss": 0.5524786710739136, "num_input_tokens_seen": 268396, "step": 81, "train_runtime": 403.3765, "train_tokens_per_second": 665.373 }, { "epoch": 0.6612903225806451, "grad_norm": 0.656579315662384, "learning_rate": 0.00019818556823214268, "loss": 0.5900183916091919, "num_input_tokens_seen": 271554, "step": 82, "train_runtime": 407.3032, "train_tokens_per_second": 666.712 }, { "epoch": 0.6693548387096774, "grad_norm": 0.7231733798980713, "learning_rate": 0.00019813701261394136, "loss": 0.6131920218467712, "num_input_tokens_seen": 274730, "step": 83, "train_runtime": 411.1381, "train_tokens_per_second": 668.218 }, { "epoch": 0.6774193548387096, "grad_norm": 0.5668199062347412, "learning_rate": 0.00019808782195961797, "loss": 0.5624396204948425, "num_input_tokens_seen": 278100, "step": 84, "train_runtime": 415.2265, "train_tokens_per_second": 669.755 }, { "epoch": 0.6854838709677419, "grad_norm": 0.5321037769317627, "learning_rate": 0.00019803799658748094, "loss": 0.512347936630249, "num_input_tokens_seen": 281468, "step": 85, "train_runtime": 419.2932, "train_tokens_per_second": 671.292 }, { "epoch": 0.6935483870967742, "grad_norm": 0.569511353969574, "learning_rate": 0.000197987536819946, "loss": 0.630191445350647, "num_input_tokens_seen": 284944, "step": 86, "train_runtime": 423.4433, "train_tokens_per_second": 672.921 }, { "epoch": 0.7016129032258065, "grad_norm": 0.5449895858764648, "learning_rate": 0.0001979364429835339, "loss": 0.511203944683075, "num_input_tokens_seen": 288252, "step": 87, "train_runtime": 427.4775, "train_tokens_per_second": 674.309 }, { "epoch": 0.7096774193548387, "grad_norm": 0.5464794635772705, "learning_rate": 0.00019788471540886844, "loss": 0.5040556192398071, "num_input_tokens_seen": 291674, "step": 88, "train_runtime": 431.587, "train_tokens_per_second": 675.817 }, { "epoch": 0.717741935483871, "grad_norm": 0.5279393792152405, "learning_rate": 0.0001978323544306743, "loss": 0.5020922422409058, "num_input_tokens_seen": 295054, "step": 89, "train_runtime": 435.6908, "train_tokens_per_second": 677.21 }, { "epoch": 0.7258064516129032, "grad_norm": 0.8044468760490417, "learning_rate": 0.00019777936038777483, "loss": 0.6041148900985718, "num_input_tokens_seen": 298162, "step": 90, "train_runtime": 439.5049, "train_tokens_per_second": 678.404 }, { "epoch": 0.7258064516129032, "eval_loss": 2.553819417953491, "eval_runtime": 10.3809, "eval_samples_per_second": 5.009, "eval_steps_per_second": 2.505, "num_input_tokens_seen": 298162, "step": 90 }, { "epoch": 0.7338709677419355, "grad_norm": 0.5703001618385315, "learning_rate": 0.0001977257336230899, "loss": 0.49885284900665283, "num_input_tokens_seen": 301500, "step": 91, "train_runtime": 458.0467, "train_tokens_per_second": 658.23 }, { "epoch": 0.7419354838709677, "grad_norm": 0.7216835021972656, "learning_rate": 0.00019767147448363366, "loss": 0.5518635511398315, "num_input_tokens_seen": 304624, "step": 92, "train_runtime": 461.8691, "train_tokens_per_second": 659.546 }, { "epoch": 0.75, "grad_norm": 0.6921418905258179, "learning_rate": 0.00019761658332051235, "loss": 0.575799822807312, "num_input_tokens_seen": 307956, "step": 93, "train_runtime": 465.9047, "train_tokens_per_second": 660.985 }, { "epoch": 0.7580645161290323, "grad_norm": 0.5977121591567993, "learning_rate": 0.00019756106048892186, "loss": 0.5771365165710449, "num_input_tokens_seen": 311354, "step": 94, "train_runtime": 470.0334, "train_tokens_per_second": 662.408 }, { "epoch": 0.7661290322580645, "grad_norm": 0.598276674747467, "learning_rate": 0.00019750490634814572, "loss": 0.5629148483276367, "num_input_tokens_seen": 314592, "step": 95, "train_runtime": 473.9733, "train_tokens_per_second": 663.734 }, { "epoch": 0.7741935483870968, "grad_norm": 0.5777994394302368, "learning_rate": 0.00019744812126155245, "loss": 0.5368872284889221, "num_input_tokens_seen": 317960, "step": 96, "train_runtime": 478.0702, "train_tokens_per_second": 665.091 }, { "epoch": 0.782258064516129, "grad_norm": 0.5279212594032288, "learning_rate": 0.00019739070559659347, "loss": 0.5620218515396118, "num_input_tokens_seen": 321274, "step": 97, "train_runtime": 482.1331, "train_tokens_per_second": 666.359 }, { "epoch": 0.7903225806451613, "grad_norm": 0.5498350858688354, "learning_rate": 0.0001973326597248006, "loss": 0.5708056092262268, "num_input_tokens_seen": 324480, "step": 98, "train_runtime": 486.061, "train_tokens_per_second": 667.571 }, { "epoch": 0.7983870967741935, "grad_norm": 0.5690199136734009, "learning_rate": 0.0001972739840217836, "loss": 0.49778610467910767, "num_input_tokens_seen": 327678, "step": 99, "train_runtime": 489.9181, "train_tokens_per_second": 668.842 }, { "epoch": 0.8064516129032258, "grad_norm": 0.5800296664237976, "learning_rate": 0.00019721467886722792, "loss": 0.5485215783119202, "num_input_tokens_seen": 330914, "step": 100, "train_runtime": 493.8199, "train_tokens_per_second": 670.111 }, { "epoch": 0.8145161290322581, "grad_norm": 0.6225796341896057, "learning_rate": 0.00019715474464489208, "loss": 0.5637421607971191, "num_input_tokens_seen": 334256, "step": 101, "train_runtime": 497.9641, "train_tokens_per_second": 671.245 }, { "epoch": 0.8225806451612904, "grad_norm": 0.5687134265899658, "learning_rate": 0.0001970941817426052, "loss": 0.5229646563529968, "num_input_tokens_seen": 337722, "step": 102, "train_runtime": 502.0618, "train_tokens_per_second": 672.67 }, { "epoch": 0.8306451612903226, "grad_norm": 0.5665640234947205, "learning_rate": 0.00019703299055226468, "loss": 0.5211078524589539, "num_input_tokens_seen": 341038, "step": 103, "train_runtime": 506.0768, "train_tokens_per_second": 673.886 }, { "epoch": 0.8387096774193549, "grad_norm": 0.6949163675308228, "learning_rate": 0.00019697117146983334, "loss": 0.5490686893463135, "num_input_tokens_seen": 344066, "step": 104, "train_runtime": 509.7166, "train_tokens_per_second": 675.014 }, { "epoch": 0.8467741935483871, "grad_norm": 0.5839992761611938, "learning_rate": 0.0001969087248953371, "loss": 0.592645525932312, "num_input_tokens_seen": 347352, "step": 105, "train_runtime": 513.6992, "train_tokens_per_second": 676.178 }, { "epoch": 0.8548387096774194, "grad_norm": 0.5174548029899597, "learning_rate": 0.00019684565123286244, "loss": 0.5339330434799194, "num_input_tokens_seen": 350654, "step": 106, "train_runtime": 517.6849, "train_tokens_per_second": 677.35 }, { "epoch": 0.8629032258064516, "grad_norm": 0.5480656027793884, "learning_rate": 0.00019678195089055346, "loss": 0.5675982236862183, "num_input_tokens_seen": 353954, "step": 107, "train_runtime": 521.5558, "train_tokens_per_second": 678.65 }, { "epoch": 0.8709677419354839, "grad_norm": 0.526905357837677, "learning_rate": 0.00019671762428060966, "loss": 0.5410143136978149, "num_input_tokens_seen": 357348, "step": 108, "train_runtime": 525.6225, "train_tokens_per_second": 679.857 }, { "epoch": 0.8790322580645161, "grad_norm": 0.5543181896209717, "learning_rate": 0.00019665267181928292, "loss": 0.5199689865112305, "num_input_tokens_seen": 360628, "step": 109, "train_runtime": 529.6075, "train_tokens_per_second": 680.934 }, { "epoch": 0.8870967741935484, "grad_norm": 0.4881467819213867, "learning_rate": 0.00019658709392687506, "loss": 0.5179097056388855, "num_input_tokens_seen": 364030, "step": 110, "train_runtime": 533.621, "train_tokens_per_second": 682.188 }, { "epoch": 0.8951612903225806, "grad_norm": 0.5893236398696899, "learning_rate": 0.00019652089102773488, "loss": 0.6043106317520142, "num_input_tokens_seen": 367360, "step": 111, "train_runtime": 537.6326, "train_tokens_per_second": 683.292 }, { "epoch": 0.9032258064516129, "grad_norm": 0.7594029903411865, "learning_rate": 0.00019645406355025565, "loss": 0.501505434513092, "num_input_tokens_seen": 370518, "step": 112, "train_runtime": 541.4395, "train_tokens_per_second": 684.32 }, { "epoch": 0.9112903225806451, "grad_norm": 0.5261175632476807, "learning_rate": 0.00019638661192687216, "loss": 0.503300666809082, "num_input_tokens_seen": 373828, "step": 113, "train_runtime": 545.4508, "train_tokens_per_second": 685.356 }, { "epoch": 0.9193548387096774, "grad_norm": 0.5733628869056702, "learning_rate": 0.00019631853659405807, "loss": 0.5247334241867065, "num_input_tokens_seen": 377226, "step": 114, "train_runtime": 549.523, "train_tokens_per_second": 686.461 }, { "epoch": 0.9274193548387096, "grad_norm": 0.5294317007064819, "learning_rate": 0.000196249837992323, "loss": 0.503804087638855, "num_input_tokens_seen": 380624, "step": 115, "train_runtime": 553.596, "train_tokens_per_second": 687.548 }, { "epoch": 0.9354838709677419, "grad_norm": 0.5543656349182129, "learning_rate": 0.0001961805165662096, "loss": 0.4710533916950226, "num_input_tokens_seen": 384012, "step": 116, "train_runtime": 557.6711, "train_tokens_per_second": 688.599 }, { "epoch": 0.9435483870967742, "grad_norm": 0.6034952402114868, "learning_rate": 0.00019611057276429085, "loss": 0.5092794299125671, "num_input_tokens_seen": 387364, "step": 117, "train_runtime": 561.7092, "train_tokens_per_second": 689.617 }, { "epoch": 0.9516129032258065, "grad_norm": 0.5937938690185547, "learning_rate": 0.00019604000703916705, "loss": 0.5077915191650391, "num_input_tokens_seen": 390740, "step": 118, "train_runtime": 565.7447, "train_tokens_per_second": 690.665 }, { "epoch": 0.9596774193548387, "grad_norm": 0.5722475051879883, "learning_rate": 0.00019596881984746287, "loss": 0.5252417325973511, "num_input_tokens_seen": 394002, "step": 119, "train_runtime": 569.7398, "train_tokens_per_second": 691.547 }, { "epoch": 0.967741935483871, "grad_norm": 0.6592346429824829, "learning_rate": 0.00019589701164982452, "loss": 0.5243955850601196, "num_input_tokens_seen": 397036, "step": 120, "train_runtime": 573.388, "train_tokens_per_second": 692.439 }, { "epoch": 0.967741935483871, "eval_loss": 2.329965114593506, "eval_runtime": 10.4196, "eval_samples_per_second": 4.991, "eval_steps_per_second": 2.495, "num_input_tokens_seen": 397036, "step": 120 }, { "epoch": 0.9758064516129032, "grad_norm": 0.6564823985099792, "learning_rate": 0.00019582458291091663, "loss": 0.605979859828949, "num_input_tokens_seen": 400216, "step": 121, "train_runtime": 591.5965, "train_tokens_per_second": 676.502 }, { "epoch": 0.9838709677419355, "grad_norm": 0.5364957451820374, "learning_rate": 0.0001957515340994193, "loss": 0.5199841260910034, "num_input_tokens_seen": 403522, "step": 122, "train_runtime": 595.5873, "train_tokens_per_second": 677.519 }, { "epoch": 0.9919354838709677, "grad_norm": 0.5787367820739746, "learning_rate": 0.000195677865688025, "loss": 0.5768128633499146, "num_input_tokens_seen": 406874, "step": 123, "train_runtime": 599.6315, "train_tokens_per_second": 678.54 }, { "epoch": 1.0, "grad_norm": 0.6326342821121216, "learning_rate": 0.00019560357815343577, "loss": 0.5327367782592773, "num_input_tokens_seen": 410114, "step": 124, "train_runtime": 603.6134, "train_tokens_per_second": 679.432 }, { "epoch": 1.0080645161290323, "grad_norm": 0.5442702174186707, "learning_rate": 0.00019552867197635974, "loss": 0.3973138928413391, "num_input_tokens_seen": 413264, "step": 125, "train_runtime": 607.4187, "train_tokens_per_second": 680.361 }, { "epoch": 1.0161290322580645, "grad_norm": 0.4810558259487152, "learning_rate": 0.00019545314764150837, "loss": 0.3840460777282715, "num_input_tokens_seen": 416432, "step": 126, "train_runtime": 611.2764, "train_tokens_per_second": 681.25 }, { "epoch": 1.0241935483870968, "grad_norm": 0.5415441989898682, "learning_rate": 0.00019537700563759304, "loss": 0.4207889437675476, "num_input_tokens_seen": 419814, "step": 127, "train_runtime": 615.3399, "train_tokens_per_second": 682.247 }, { "epoch": 1.032258064516129, "grad_norm": 0.5859441757202148, "learning_rate": 0.00019530024645732206, "loss": 0.4236603379249573, "num_input_tokens_seen": 423168, "step": 128, "train_runtime": 619.4394, "train_tokens_per_second": 683.147 }, { "epoch": 1.0403225806451613, "grad_norm": 0.6688714623451233, "learning_rate": 0.00019522287059739753, "loss": 0.446853905916214, "num_input_tokens_seen": 426248, "step": 129, "train_runtime": 623.1252, "train_tokens_per_second": 684.049 }, { "epoch": 1.0483870967741935, "grad_norm": 0.6689234972000122, "learning_rate": 0.00019514487855851184, "loss": 0.44393691420555115, "num_input_tokens_seen": 429726, "step": 130, "train_runtime": 627.2816, "train_tokens_per_second": 685.061 }, { "epoch": 1.0564516129032258, "grad_norm": 0.623140275478363, "learning_rate": 0.00019506627084534483, "loss": 0.44002050161361694, "num_input_tokens_seen": 432912, "step": 131, "train_runtime": 631.1917, "train_tokens_per_second": 685.865 }, { "epoch": 1.064516129032258, "grad_norm": 0.5681182146072388, "learning_rate": 0.00019498704796656018, "loss": 0.44711700081825256, "num_input_tokens_seen": 436246, "step": 132, "train_runtime": 635.2752, "train_tokens_per_second": 686.704 }, { "epoch": 1.0725806451612903, "grad_norm": 0.5604093670845032, "learning_rate": 0.00019490721043480226, "loss": 0.44671785831451416, "num_input_tokens_seen": 439446, "step": 133, "train_runtime": 639.1181, "train_tokens_per_second": 687.582 }, { "epoch": 1.0806451612903225, "grad_norm": 0.550987720489502, "learning_rate": 0.00019482675876669286, "loss": 0.41002413630485535, "num_input_tokens_seen": 442716, "step": 134, "train_runtime": 643.1023, "train_tokens_per_second": 688.407 }, { "epoch": 1.0887096774193548, "grad_norm": 0.48927298188209534, "learning_rate": 0.00019474569348282774, "loss": 0.4202808737754822, "num_input_tokens_seen": 445948, "step": 135, "train_runtime": 646.9886, "train_tokens_per_second": 689.267 }, { "epoch": 1.096774193548387, "grad_norm": 0.6476680040359497, "learning_rate": 0.0001946640151077734, "loss": 0.45923081040382385, "num_input_tokens_seen": 449172, "step": 136, "train_runtime": 650.8352, "train_tokens_per_second": 690.147 }, { "epoch": 1.1048387096774193, "grad_norm": 0.6251439452171326, "learning_rate": 0.00019458172417006347, "loss": 0.5086702108383179, "num_input_tokens_seen": 452512, "step": 137, "train_runtime": 654.8721, "train_tokens_per_second": 690.993 }, { "epoch": 1.1129032258064515, "grad_norm": 0.5364375114440918, "learning_rate": 0.00019449882120219555, "loss": 0.4321169853210449, "num_input_tokens_seen": 455922, "step": 138, "train_runtime": 658.9413, "train_tokens_per_second": 691.901 }, { "epoch": 1.120967741935484, "grad_norm": 0.5967617630958557, "learning_rate": 0.00019441530674062753, "loss": 0.4385676681995392, "num_input_tokens_seen": 459246, "step": 139, "train_runtime": 662.9535, "train_tokens_per_second": 692.727 }, { "epoch": 1.129032258064516, "grad_norm": 0.6668894290924072, "learning_rate": 0.0001943311813257743, "loss": 0.38102924823760986, "num_input_tokens_seen": 462644, "step": 140, "train_runtime": 667.0197, "train_tokens_per_second": 693.599 }, { "epoch": 1.1370967741935485, "grad_norm": 0.6523346900939941, "learning_rate": 0.00019424644550200415, "loss": 0.45807939767837524, "num_input_tokens_seen": 466004, "step": 141, "train_runtime": 671.0685, "train_tokens_per_second": 694.421 }, { "epoch": 1.1451612903225807, "grad_norm": 0.607889711856842, "learning_rate": 0.00019416109981763526, "loss": 0.4723294675350189, "num_input_tokens_seen": 469474, "step": 142, "train_runtime": 675.1673, "train_tokens_per_second": 695.345 }, { "epoch": 1.153225806451613, "grad_norm": 0.5858147740364075, "learning_rate": 0.00019407514482493214, "loss": 0.39813458919525146, "num_input_tokens_seen": 472830, "step": 143, "train_runtime": 679.2026, "train_tokens_per_second": 696.155 }, { "epoch": 1.1612903225806452, "grad_norm": 0.5359724164009094, "learning_rate": 0.00019398858108010217, "loss": 0.46676114201545715, "num_input_tokens_seen": 476146, "step": 144, "train_runtime": 683.2422, "train_tokens_per_second": 696.892 }, { "epoch": 1.1693548387096775, "grad_norm": 0.9037399888038635, "learning_rate": 0.0001939014091432918, "loss": 0.4816886782646179, "num_input_tokens_seen": 479388, "step": 145, "train_runtime": 687.1531, "train_tokens_per_second": 697.644 }, { "epoch": 1.1774193548387097, "grad_norm": 0.5351356267929077, "learning_rate": 0.00019381362957858312, "loss": 0.4616449475288391, "num_input_tokens_seen": 482820, "step": 146, "train_runtime": 691.2213, "train_tokens_per_second": 698.503 }, { "epoch": 1.185483870967742, "grad_norm": 0.5950395464897156, "learning_rate": 0.00019372524295399013, "loss": 0.4098345935344696, "num_input_tokens_seen": 485956, "step": 147, "train_runtime": 694.9967, "train_tokens_per_second": 699.221 }, { "epoch": 1.1935483870967742, "grad_norm": 0.496609091758728, "learning_rate": 0.00019363624984145502, "loss": 0.4433990716934204, "num_input_tokens_seen": 489266, "step": 148, "train_runtime": 698.9825, "train_tokens_per_second": 699.969 }, { "epoch": 1.2016129032258065, "grad_norm": 0.5592359304428101, "learning_rate": 0.00019354665081684446, "loss": 0.39385470747947693, "num_input_tokens_seen": 492712, "step": 149, "train_runtime": 703.0554, "train_tokens_per_second": 700.815 }, { "epoch": 1.2096774193548387, "grad_norm": 0.5236433744430542, "learning_rate": 0.0001934564464599461, "loss": 0.366371750831604, "num_input_tokens_seen": 496182, "step": 150, "train_runtime": 707.1645, "train_tokens_per_second": 701.65 }, { "epoch": 1.2096774193548387, "eval_loss": 2.3865628242492676, "eval_runtime": 10.3898, "eval_samples_per_second": 5.005, "eval_steps_per_second": 2.502, "num_input_tokens_seen": 496182, "step": 150 }, { "epoch": 1.217741935483871, "grad_norm": 0.6336263418197632, "learning_rate": 0.00019336563735446446, "loss": 0.46447595953941345, "num_input_tokens_seen": 499482, "step": 151, "train_runtime": 725.6025, "train_tokens_per_second": 688.369 }, { "epoch": 1.2258064516129032, "grad_norm": 0.6058643460273743, "learning_rate": 0.00019327422408801744, "loss": 0.4268086552619934, "num_input_tokens_seen": 502832, "step": 152, "train_runtime": 729.6116, "train_tokens_per_second": 689.178 }, { "epoch": 1.2338709677419355, "grad_norm": 0.5647494792938232, "learning_rate": 0.0001931822072521323, "loss": 0.4002935588359833, "num_input_tokens_seen": 506196, "step": 153, "train_runtime": 733.6971, "train_tokens_per_second": 689.925 }, { "epoch": 1.2419354838709677, "grad_norm": 0.6857813000679016, "learning_rate": 0.00019308958744224217, "loss": 0.41752880811691284, "num_input_tokens_seen": 509348, "step": 154, "train_runtime": 737.5425, "train_tokens_per_second": 690.602 }, { "epoch": 1.25, "grad_norm": 0.5805030465126038, "learning_rate": 0.00019299636525768173, "loss": 0.4200356602668762, "num_input_tokens_seen": 512728, "step": 155, "train_runtime": 741.6553, "train_tokens_per_second": 691.329 }, { "epoch": 1.2580645161290323, "grad_norm": 1.3187825679779053, "learning_rate": 0.00019290254130168374, "loss": 0.413343608379364, "num_input_tokens_seen": 516078, "step": 156, "train_runtime": 745.6977, "train_tokens_per_second": 692.074 }, { "epoch": 1.2661290322580645, "grad_norm": 0.7968809604644775, "learning_rate": 0.00019280811618137484, "loss": 0.4406548738479614, "num_input_tokens_seen": 519482, "step": 157, "train_runtime": 749.8044, "train_tokens_per_second": 692.823 }, { "epoch": 1.2741935483870968, "grad_norm": 0.5864307880401611, "learning_rate": 0.00019271309050777183, "loss": 0.44008421897888184, "num_input_tokens_seen": 522758, "step": 158, "train_runtime": 753.7142, "train_tokens_per_second": 693.576 }, { "epoch": 1.282258064516129, "grad_norm": 0.5749614238739014, "learning_rate": 0.00019261746489577765, "loss": 0.3923473656177521, "num_input_tokens_seen": 525784, "step": 159, "train_runtime": 757.3998, "train_tokens_per_second": 694.196 }, { "epoch": 1.2903225806451613, "grad_norm": 0.5030391216278076, "learning_rate": 0.00019252123996417738, "loss": 0.37407517433166504, "num_input_tokens_seen": 529130, "step": 160, "train_runtime": 761.4335, "train_tokens_per_second": 694.913 }, { "epoch": 1.2983870967741935, "grad_norm": 0.6000431180000305, "learning_rate": 0.00019242441633563417, "loss": 0.43879765272140503, "num_input_tokens_seen": 532308, "step": 161, "train_runtime": 765.2898, "train_tokens_per_second": 695.564 }, { "epoch": 1.3064516129032258, "grad_norm": 0.5326035022735596, "learning_rate": 0.00019232699463668542, "loss": 0.38204699754714966, "num_input_tokens_seen": 535662, "step": 162, "train_runtime": 769.3512, "train_tokens_per_second": 696.252 }, { "epoch": 1.314516129032258, "grad_norm": 0.6022024154663086, "learning_rate": 0.00019222897549773848, "loss": 0.4019436240196228, "num_input_tokens_seen": 539056, "step": 163, "train_runtime": 773.4493, "train_tokens_per_second": 696.951 }, { "epoch": 1.3225806451612903, "grad_norm": 0.5885249972343445, "learning_rate": 0.0001921303595530667, "loss": 0.39464306831359863, "num_input_tokens_seen": 542450, "step": 164, "train_runtime": 777.4896, "train_tokens_per_second": 697.694 }, { "epoch": 1.3306451612903225, "grad_norm": 0.7338830828666687, "learning_rate": 0.00019203114744080542, "loss": 0.47657448053359985, "num_input_tokens_seen": 545796, "step": 165, "train_runtime": 781.5136, "train_tokens_per_second": 698.383 }, { "epoch": 1.3387096774193548, "grad_norm": 0.6826189160346985, "learning_rate": 0.0001919313398029475, "loss": 0.4395520091056824, "num_input_tokens_seen": 548960, "step": 166, "train_runtime": 785.3294, "train_tokens_per_second": 699.019 }, { "epoch": 1.346774193548387, "grad_norm": 0.5975714325904846, "learning_rate": 0.00019183093728533966, "loss": 0.42339080572128296, "num_input_tokens_seen": 552310, "step": 167, "train_runtime": 789.3433, "train_tokens_per_second": 699.708 }, { "epoch": 1.3548387096774195, "grad_norm": 0.5949293375015259, "learning_rate": 0.00019172994053767784, "loss": 0.45283418893814087, "num_input_tokens_seen": 555660, "step": 168, "train_runtime": 793.3402, "train_tokens_per_second": 700.406 }, { "epoch": 1.3629032258064515, "grad_norm": 0.5816603302955627, "learning_rate": 0.0001916283502135033, "loss": 0.43116986751556396, "num_input_tokens_seen": 558942, "step": 169, "train_runtime": 797.3188, "train_tokens_per_second": 701.027 }, { "epoch": 1.370967741935484, "grad_norm": 0.6768733859062195, "learning_rate": 0.00019152616697019822, "loss": 0.465903103351593, "num_input_tokens_seen": 561914, "step": 170, "train_runtime": 800.9084, "train_tokens_per_second": 701.596 }, { "epoch": 1.379032258064516, "grad_norm": 0.5320206880569458, "learning_rate": 0.0001914233914689815, "loss": 0.4548630714416504, "num_input_tokens_seen": 565128, "step": 171, "train_runtime": 804.723, "train_tokens_per_second": 702.264 }, { "epoch": 1.3870967741935485, "grad_norm": 0.5698583126068115, "learning_rate": 0.00019132002437490458, "loss": 0.41211193799972534, "num_input_tokens_seen": 568492, "step": 172, "train_runtime": 808.7513, "train_tokens_per_second": 702.926 }, { "epoch": 1.3951612903225805, "grad_norm": 0.6660341620445251, "learning_rate": 0.00019121606635684696, "loss": 0.4983174800872803, "num_input_tokens_seen": 571838, "step": 173, "train_runtime": 812.784, "train_tokens_per_second": 703.555 }, { "epoch": 1.403225806451613, "grad_norm": 0.5148375630378723, "learning_rate": 0.00019111151808751196, "loss": 0.3650531768798828, "num_input_tokens_seen": 575172, "step": 174, "train_runtime": 816.8205, "train_tokens_per_second": 704.16 }, { "epoch": 1.4112903225806452, "grad_norm": 0.680073082447052, "learning_rate": 0.00019100638024342244, "loss": 0.4350316524505615, "num_input_tokens_seen": 578472, "step": 175, "train_runtime": 820.7995, "train_tokens_per_second": 704.767 }, { "epoch": 1.4193548387096775, "grad_norm": 0.5539473295211792, "learning_rate": 0.00019090065350491626, "loss": 0.4079291522502899, "num_input_tokens_seen": 581854, "step": 176, "train_runtime": 824.8486, "train_tokens_per_second": 705.407 }, { "epoch": 1.4274193548387097, "grad_norm": 0.6199740171432495, "learning_rate": 0.00019079433855614201, "loss": 0.4282647371292114, "num_input_tokens_seen": 585240, "step": 177, "train_runtime": 828.8684, "train_tokens_per_second": 706.071 }, { "epoch": 1.435483870967742, "grad_norm": 0.7518092393875122, "learning_rate": 0.00019068743608505455, "loss": 0.4538434147834778, "num_input_tokens_seen": 588466, "step": 178, "train_runtime": 832.705, "train_tokens_per_second": 706.692 }, { "epoch": 1.4435483870967742, "grad_norm": 0.6203386187553406, "learning_rate": 0.0001905799467834105, "loss": 0.4033154547214508, "num_input_tokens_seen": 591716, "step": 179, "train_runtime": 836.5606, "train_tokens_per_second": 707.32 }, { "epoch": 1.4516129032258065, "grad_norm": 0.5324673056602478, "learning_rate": 0.00019047187134676387, "loss": 0.40287455916404724, "num_input_tokens_seen": 595124, "step": 180, "train_runtime": 840.6426, "train_tokens_per_second": 707.939 }, { "epoch": 1.4516129032258065, "eval_loss": 2.244081735610962, "eval_runtime": 10.2876, "eval_samples_per_second": 5.055, "eval_steps_per_second": 2.527, "num_input_tokens_seen": 595124, "step": 180 }, { "epoch": 1.4596774193548387, "grad_norm": 0.5803776979446411, "learning_rate": 0.0001903632104744614, "loss": 0.40973353385925293, "num_input_tokens_seen": 598560, "step": 181, "train_runtime": 859.0393, "train_tokens_per_second": 696.778 }, { "epoch": 1.467741935483871, "grad_norm": 0.6523128747940063, "learning_rate": 0.00019025396486963827, "loss": 0.46758562326431274, "num_input_tokens_seen": 601832, "step": 182, "train_runtime": 862.9801, "train_tokens_per_second": 697.388 }, { "epoch": 1.4758064516129032, "grad_norm": 0.5893530249595642, "learning_rate": 0.0001901441352392133, "loss": 0.4586262106895447, "num_input_tokens_seen": 605124, "step": 183, "train_runtime": 866.9954, "train_tokens_per_second": 697.955 }, { "epoch": 1.4838709677419355, "grad_norm": 0.5453356504440308, "learning_rate": 0.00019003372229388452, "loss": 0.44551703333854675, "num_input_tokens_seen": 608550, "step": 184, "train_runtime": 871.1073, "train_tokens_per_second": 698.594 }, { "epoch": 1.4919354838709677, "grad_norm": 0.5781370997428894, "learning_rate": 0.0001899227267481246, "loss": 0.47110262513160706, "num_input_tokens_seen": 611912, "step": 185, "train_runtime": 875.213, "train_tokens_per_second": 699.158 }, { "epoch": 1.5, "grad_norm": 0.591170608997345, "learning_rate": 0.00018981114932017609, "loss": 0.4550918936729431, "num_input_tokens_seen": 615114, "step": 186, "train_runtime": 879.1905, "train_tokens_per_second": 699.637 }, { "epoch": 1.5080645161290323, "grad_norm": 0.5168197751045227, "learning_rate": 0.00018969899073204686, "loss": 0.39351943135261536, "num_input_tokens_seen": 618532, "step": 187, "train_runtime": 883.2978, "train_tokens_per_second": 700.253 }, { "epoch": 1.5161290322580645, "grad_norm": 0.6083711385726929, "learning_rate": 0.00018958625170950545, "loss": 0.43661871552467346, "num_input_tokens_seen": 621584, "step": 188, "train_runtime": 887.0132, "train_tokens_per_second": 700.761 }, { "epoch": 1.5241935483870968, "grad_norm": 0.5687974095344543, "learning_rate": 0.00018947293298207635, "loss": 0.45791560411453247, "num_input_tokens_seen": 624890, "step": 189, "train_runtime": 891.0431, "train_tokens_per_second": 701.302 }, { "epoch": 1.532258064516129, "grad_norm": 0.747931957244873, "learning_rate": 0.00018935903528303523, "loss": 0.491569459438324, "num_input_tokens_seen": 627898, "step": 190, "train_runtime": 894.7166, "train_tokens_per_second": 701.784 }, { "epoch": 1.5403225806451613, "grad_norm": 0.49939557909965515, "learning_rate": 0.0001892445593494042, "loss": 0.3926829695701599, "num_input_tokens_seen": 631260, "step": 191, "train_runtime": 898.7539, "train_tokens_per_second": 702.372 }, { "epoch": 1.5483870967741935, "grad_norm": 0.5808703303337097, "learning_rate": 0.0001891295059219472, "loss": 0.4172174334526062, "num_input_tokens_seen": 634598, "step": 192, "train_runtime": 902.7756, "train_tokens_per_second": 702.941 }, { "epoch": 1.5564516129032258, "grad_norm": 0.573276937007904, "learning_rate": 0.00018901387574516497, "loss": 0.4499248266220093, "num_input_tokens_seen": 637776, "step": 193, "train_runtime": 906.6301, "train_tokens_per_second": 703.458 }, { "epoch": 1.564516129032258, "grad_norm": 2.77301287651062, "learning_rate": 0.00018889766956729044, "loss": 0.43817731738090515, "num_input_tokens_seen": 641100, "step": 194, "train_runtime": 910.6441, "train_tokens_per_second": 704.007 }, { "epoch": 1.5725806451612905, "grad_norm": 0.5687347054481506, "learning_rate": 0.00018878088814028364, "loss": 0.41259127855300903, "num_input_tokens_seen": 644472, "step": 195, "train_runtime": 914.6988, "train_tokens_per_second": 704.573 }, { "epoch": 1.5806451612903225, "grad_norm": 0.5743884444236755, "learning_rate": 0.00018866353221982718, "loss": 0.371239572763443, "num_input_tokens_seen": 647738, "step": 196, "train_runtime": 918.7012, "train_tokens_per_second": 705.058 }, { "epoch": 1.588709677419355, "grad_norm": 0.5498597621917725, "learning_rate": 0.000188545602565321, "loss": 0.453549861907959, "num_input_tokens_seen": 651030, "step": 197, "train_runtime": 922.5884, "train_tokens_per_second": 705.656 }, { "epoch": 1.596774193548387, "grad_norm": 0.6513309478759766, "learning_rate": 0.00018842709993987776, "loss": 0.4566512107849121, "num_input_tokens_seen": 654362, "step": 198, "train_runtime": 926.5867, "train_tokens_per_second": 706.207 }, { "epoch": 1.6048387096774195, "grad_norm": 0.6124453544616699, "learning_rate": 0.00018830802511031762, "loss": 0.406619131565094, "num_input_tokens_seen": 657668, "step": 199, "train_runtime": 930.5442, "train_tokens_per_second": 706.756 }, { "epoch": 1.6129032258064515, "grad_norm": 0.5730791091918945, "learning_rate": 0.0001881883788471636, "loss": 0.4061928987503052, "num_input_tokens_seen": 661006, "step": 200, "train_runtime": 934.5634, "train_tokens_per_second": 707.289 }, { "epoch": 1.620967741935484, "grad_norm": 0.5406919121742249, "learning_rate": 0.00018806816192463625, "loss": 0.4146749973297119, "num_input_tokens_seen": 664466, "step": 201, "train_runtime": 938.6171, "train_tokens_per_second": 707.92 }, { "epoch": 1.629032258064516, "grad_norm": 0.6002207398414612, "learning_rate": 0.0001879473751206489, "loss": 0.4497522711753845, "num_input_tokens_seen": 667634, "step": 202, "train_runtime": 942.4299, "train_tokens_per_second": 708.418 }, { "epoch": 1.6370967741935485, "grad_norm": 0.5770663619041443, "learning_rate": 0.00018782601921680256, "loss": 0.4500475823879242, "num_input_tokens_seen": 671080, "step": 203, "train_runtime": 946.5233, "train_tokens_per_second": 708.995 }, { "epoch": 1.6451612903225805, "grad_norm": 0.6245407462120056, "learning_rate": 0.00018770409499838073, "loss": 0.49075624346733093, "num_input_tokens_seen": 674508, "step": 204, "train_runtime": 950.6315, "train_tokens_per_second": 709.537 }, { "epoch": 1.653225806451613, "grad_norm": 0.589747965335846, "learning_rate": 0.0001875816032543445, "loss": 0.4112730324268341, "num_input_tokens_seen": 677700, "step": 205, "train_runtime": 954.474, "train_tokens_per_second": 710.025 }, { "epoch": 1.661290322580645, "grad_norm": 0.5470958352088928, "learning_rate": 0.00018745854477732733, "loss": 0.42240893840789795, "num_input_tokens_seen": 680978, "step": 206, "train_runtime": 958.4736, "train_tokens_per_second": 710.482 }, { "epoch": 1.6693548387096775, "grad_norm": 0.5093806385993958, "learning_rate": 0.00018733492036363005, "loss": 0.40931662917137146, "num_input_tokens_seen": 684328, "step": 207, "train_runtime": 962.4879, "train_tokens_per_second": 710.999 }, { "epoch": 1.6774193548387095, "grad_norm": 0.6141051650047302, "learning_rate": 0.0001872107308132155, "loss": 0.4374140501022339, "num_input_tokens_seen": 687676, "step": 208, "train_runtime": 966.4877, "train_tokens_per_second": 711.521 }, { "epoch": 1.685483870967742, "grad_norm": 0.5096253156661987, "learning_rate": 0.00018708597692970353, "loss": 0.3743232488632202, "num_input_tokens_seen": 691000, "step": 209, "train_runtime": 970.5016, "train_tokens_per_second": 712.003 }, { "epoch": 1.6935483870967742, "grad_norm": 0.6781167984008789, "learning_rate": 0.00018696065952036571, "loss": 0.45937836170196533, "num_input_tokens_seen": 694156, "step": 210, "train_runtime": 974.3365, "train_tokens_per_second": 712.44 }, { "epoch": 1.6935483870967742, "eval_loss": 2.2753806114196777, "eval_runtime": 10.368, "eval_samples_per_second": 5.015, "eval_steps_per_second": 2.508, "num_input_tokens_seen": 694156, "step": 210 }, { "epoch": 1.7016129032258065, "grad_norm": 0.5530876517295837, "learning_rate": 0.00018683477939612021, "loss": 0.39176639914512634, "num_input_tokens_seen": 697510, "step": 211, "train_runtime": 992.7122, "train_tokens_per_second": 702.631 }, { "epoch": 1.7096774193548387, "grad_norm": 0.5276106595993042, "learning_rate": 0.0001867083373715264, "loss": 0.3990570604801178, "num_input_tokens_seen": 700884, "step": 212, "train_runtime": 996.7415, "train_tokens_per_second": 703.175 }, { "epoch": 1.717741935483871, "grad_norm": 0.6544950008392334, "learning_rate": 0.00018658133426477965, "loss": 0.477338969707489, "num_input_tokens_seen": 704288, "step": 213, "train_runtime": 1000.8249, "train_tokens_per_second": 703.708 }, { "epoch": 1.7258064516129032, "grad_norm": 0.6669942736625671, "learning_rate": 0.00018645377089770616, "loss": 0.41006964445114136, "num_input_tokens_seen": 707404, "step": 214, "train_runtime": 1004.6485, "train_tokens_per_second": 704.131 }, { "epoch": 1.7338709677419355, "grad_norm": 0.6127757430076599, "learning_rate": 0.00018632564809575742, "loss": 0.4169791340827942, "num_input_tokens_seen": 710738, "step": 215, "train_runtime": 1008.6929, "train_tokens_per_second": 704.613 }, { "epoch": 1.7419354838709677, "grad_norm": 0.6781405210494995, "learning_rate": 0.00018619696668800492, "loss": 0.49052947759628296, "num_input_tokens_seen": 714052, "step": 216, "train_runtime": 1012.7035, "train_tokens_per_second": 705.095 }, { "epoch": 1.75, "grad_norm": 0.5324527621269226, "learning_rate": 0.00018606772750713504, "loss": 0.4169653654098511, "num_input_tokens_seen": 717406, "step": 217, "train_runtime": 1016.803, "train_tokens_per_second": 705.551 }, { "epoch": 1.7580645161290323, "grad_norm": 0.5233943462371826, "learning_rate": 0.00018593793138944328, "loss": 0.4026474058628082, "num_input_tokens_seen": 720724, "step": 218, "train_runtime": 1020.857, "train_tokens_per_second": 705.999 }, { "epoch": 1.7661290322580645, "grad_norm": 0.5787035226821899, "learning_rate": 0.0001858075791748291, "loss": 0.501507043838501, "num_input_tokens_seen": 724156, "step": 219, "train_runtime": 1025.0039, "train_tokens_per_second": 706.491 }, { "epoch": 1.7741935483870968, "grad_norm": 0.5081259608268738, "learning_rate": 0.0001856766717067904, "loss": 0.41434332728385925, "num_input_tokens_seen": 727386, "step": 220, "train_runtime": 1029.0266, "train_tokens_per_second": 706.868 }, { "epoch": 1.782258064516129, "grad_norm": 0.4951300024986267, "learning_rate": 0.00018554520983241814, "loss": 0.4030165374279022, "num_input_tokens_seen": 730576, "step": 221, "train_runtime": 1032.9949, "train_tokens_per_second": 707.241 }, { "epoch": 1.7903225806451613, "grad_norm": 0.47789162397384644, "learning_rate": 0.00018541319440239066, "loss": 0.37602391839027405, "num_input_tokens_seen": 734042, "step": 222, "train_runtime": 1037.1175, "train_tokens_per_second": 707.771 }, { "epoch": 1.7983870967741935, "grad_norm": 0.5510605573654175, "learning_rate": 0.00018528062627096845, "loss": 0.4142032861709595, "num_input_tokens_seen": 737388, "step": 223, "train_runtime": 1041.1341, "train_tokens_per_second": 708.255 }, { "epoch": 1.8064516129032258, "grad_norm": 0.5054709315299988, "learning_rate": 0.0001851475062959884, "loss": 0.4083865284919739, "num_input_tokens_seen": 740714, "step": 224, "train_runtime": 1045.1999, "train_tokens_per_second": 708.682 }, { "epoch": 1.814516129032258, "grad_norm": 0.5532689690589905, "learning_rate": 0.00018501383533885837, "loss": 0.3750511407852173, "num_input_tokens_seen": 743952, "step": 225, "train_runtime": 1049.1088, "train_tokens_per_second": 709.128 }, { "epoch": 1.8225806451612905, "grad_norm": 0.5842207074165344, "learning_rate": 0.00018487961426455157, "loss": 0.4582732319831848, "num_input_tokens_seen": 747324, "step": 226, "train_runtime": 1053.1583, "train_tokens_per_second": 709.603 }, { "epoch": 1.8306451612903225, "grad_norm": 0.5713872313499451, "learning_rate": 0.0001847448439416009, "loss": 0.4612322449684143, "num_input_tokens_seen": 750660, "step": 227, "train_runtime": 1057.1876, "train_tokens_per_second": 710.054 }, { "epoch": 1.838709677419355, "grad_norm": 0.7017268538475037, "learning_rate": 0.00018460952524209355, "loss": 0.4732731580734253, "num_input_tokens_seen": 753890, "step": 228, "train_runtime": 1061.0082, "train_tokens_per_second": 710.541 }, { "epoch": 1.846774193548387, "grad_norm": 0.5345413684844971, "learning_rate": 0.0001844736590416651, "loss": 0.40334969758987427, "num_input_tokens_seen": 757196, "step": 229, "train_runtime": 1065.0157, "train_tokens_per_second": 710.972 }, { "epoch": 1.8548387096774195, "grad_norm": 0.5843684673309326, "learning_rate": 0.00018433724621949392, "loss": 0.4093519449234009, "num_input_tokens_seen": 760606, "step": 230, "train_runtime": 1069.0746, "train_tokens_per_second": 711.462 }, { "epoch": 1.8629032258064515, "grad_norm": 0.5555073022842407, "learning_rate": 0.00018420028765829568, "loss": 0.40939491987228394, "num_input_tokens_seen": 763910, "step": 231, "train_runtime": 1073.0585, "train_tokens_per_second": 711.9 }, { "epoch": 1.870967741935484, "grad_norm": 0.5539902448654175, "learning_rate": 0.00018406278424431736, "loss": 0.43555861711502075, "num_input_tokens_seen": 767166, "step": 232, "train_runtime": 1076.9274, "train_tokens_per_second": 712.366 }, { "epoch": 1.879032258064516, "grad_norm": 0.5120404958724976, "learning_rate": 0.00018392473686733163, "loss": 0.43676942586898804, "num_input_tokens_seen": 770592, "step": 233, "train_runtime": 1080.9922, "train_tokens_per_second": 712.856 }, { "epoch": 1.8870967741935485, "grad_norm": 0.6968815922737122, "learning_rate": 0.00018378614642063115, "loss": 0.4551546275615692, "num_input_tokens_seen": 773594, "step": 234, "train_runtime": 1084.6349, "train_tokens_per_second": 713.23 }, { "epoch": 1.8951612903225805, "grad_norm": 0.6711899638175964, "learning_rate": 0.00018364701380102266, "loss": 0.47881585359573364, "num_input_tokens_seen": 776592, "step": 235, "train_runtime": 1088.2536, "train_tokens_per_second": 713.613 }, { "epoch": 1.903225806451613, "grad_norm": 0.5762051939964294, "learning_rate": 0.0001835073399088214, "loss": 0.48081904649734497, "num_input_tokens_seen": 779942, "step": 236, "train_runtime": 1092.3842, "train_tokens_per_second": 713.981 }, { "epoch": 1.911290322580645, "grad_norm": 0.47804713249206543, "learning_rate": 0.00018336712564784503, "loss": 0.39908868074417114, "num_input_tokens_seen": 783252, "step": 237, "train_runtime": 1096.4305, "train_tokens_per_second": 714.365 }, { "epoch": 1.9193548387096775, "grad_norm": 0.6200618147850037, "learning_rate": 0.00018322637192540785, "loss": 0.4580332636833191, "num_input_tokens_seen": 786556, "step": 238, "train_runtime": 1100.4502, "train_tokens_per_second": 714.758 }, { "epoch": 1.9274193548387095, "grad_norm": 0.6390965580940247, "learning_rate": 0.00018308507965231508, "loss": 0.4557971954345703, "num_input_tokens_seen": 789882, "step": 239, "train_runtime": 1104.4776, "train_tokens_per_second": 715.163 }, { "epoch": 1.935483870967742, "grad_norm": 0.5260592699050903, "learning_rate": 0.00018294324974285677, "loss": 0.382771372795105, "num_input_tokens_seen": 793254, "step": 240, "train_runtime": 1108.5259, "train_tokens_per_second": 715.594 }, { "epoch": 1.935483870967742, "eval_loss": 2.1555724143981934, "eval_runtime": 10.4094, "eval_samples_per_second": 4.996, "eval_steps_per_second": 2.498, "num_input_tokens_seen": 793254, "step": 240 }, { "epoch": 1.9435483870967742, "grad_norm": 0.6225792169570923, "learning_rate": 0.00018280088311480201, "loss": 0.45386967062950134, "num_input_tokens_seen": 796604, "step": 241, "train_runtime": 1127.0971, "train_tokens_per_second": 706.775 }, { "epoch": 1.9516129032258065, "grad_norm": 0.5846142172813416, "learning_rate": 0.00018265798068939294, "loss": 0.4212104082107544, "num_input_tokens_seen": 799812, "step": 242, "train_runtime": 1131.0228, "train_tokens_per_second": 707.158 }, { "epoch": 1.9596774193548387, "grad_norm": 0.6712385416030884, "learning_rate": 0.0001825145433913388, "loss": 0.48295828700065613, "num_input_tokens_seen": 802998, "step": 243, "train_runtime": 1134.8725, "train_tokens_per_second": 707.567 }, { "epoch": 1.967741935483871, "grad_norm": 0.5474020838737488, "learning_rate": 0.00018237057214880994, "loss": 0.41326338052749634, "num_input_tokens_seen": 806314, "step": 244, "train_runtime": 1138.9273, "train_tokens_per_second": 707.959 }, { "epoch": 1.9758064516129032, "grad_norm": 0.5561951994895935, "learning_rate": 0.00018222606789343183, "loss": 0.4193962514400482, "num_input_tokens_seen": 809616, "step": 245, "train_runtime": 1142.9876, "train_tokens_per_second": 708.333 }, { "epoch": 1.9838709677419355, "grad_norm": 0.7057868242263794, "learning_rate": 0.00018208103156027897, "loss": 0.4625564515590668, "num_input_tokens_seen": 812656, "step": 246, "train_runtime": 1146.6931, "train_tokens_per_second": 708.695 }, { "epoch": 1.9919354838709677, "grad_norm": 0.5412559509277344, "learning_rate": 0.00018193546408786898, "loss": 0.3980866074562073, "num_input_tokens_seen": 815812, "step": 247, "train_runtime": 1150.55, "train_tokens_per_second": 709.063 }, { "epoch": 2.0, "grad_norm": 0.6223896145820618, "learning_rate": 0.00018178936641815636, "loss": 0.508427083492279, "num_input_tokens_seen": 818984, "step": 248, "train_runtime": 1154.4438, "train_tokens_per_second": 709.419 }, { "epoch": 2.0080645161290325, "grad_norm": 0.47228068113327026, "learning_rate": 0.0001816427394965265, "loss": 0.30645275115966797, "num_input_tokens_seen": 822146, "step": 249, "train_runtime": 1158.3569, "train_tokens_per_second": 709.752 }, { "epoch": 2.0161290322580645, "grad_norm": 0.45574134588241577, "learning_rate": 0.00018149558427178956, "loss": 0.2756215035915375, "num_input_tokens_seen": 825434, "step": 250, "train_runtime": 1162.3738, "train_tokens_per_second": 710.128 }, { "epoch": 2.024193548387097, "grad_norm": 0.5037111639976501, "learning_rate": 0.00018134790169617419, "loss": 0.26711219549179077, "num_input_tokens_seen": 828594, "step": 251, "train_runtime": 1166.2381, "train_tokens_per_second": 710.484 }, { "epoch": 2.032258064516129, "grad_norm": 0.468986839056015, "learning_rate": 0.00018119969272532166, "loss": 0.25423315167427063, "num_input_tokens_seen": 831870, "step": 252, "train_runtime": 1170.2506, "train_tokens_per_second": 710.848 }, { "epoch": 2.0403225806451615, "grad_norm": 0.554565966129303, "learning_rate": 0.00018105095831827934, "loss": 0.27069148421287537, "num_input_tokens_seen": 835230, "step": 253, "train_runtime": 1174.3065, "train_tokens_per_second": 711.254 }, { "epoch": 2.0483870967741935, "grad_norm": 0.7102289795875549, "learning_rate": 0.00018090169943749476, "loss": 0.30719172954559326, "num_input_tokens_seen": 838606, "step": 254, "train_runtime": 1178.3482, "train_tokens_per_second": 711.679 }, { "epoch": 2.056451612903226, "grad_norm": 0.9548736810684204, "learning_rate": 0.0001807519170488092, "loss": 0.2515634298324585, "num_input_tokens_seen": 841730, "step": 255, "train_runtime": 1182.2287, "train_tokens_per_second": 711.986 }, { "epoch": 2.064516129032258, "grad_norm": 0.6869332194328308, "learning_rate": 0.00018060161212145155, "loss": 0.2824576497077942, "num_input_tokens_seen": 844882, "step": 256, "train_runtime": 1186.0526, "train_tokens_per_second": 712.348 }, { "epoch": 2.0725806451612905, "grad_norm": 0.7332234382629395, "learning_rate": 0.00018045078562803203, "loss": 0.32276466488838196, "num_input_tokens_seen": 848268, "step": 257, "train_runtime": 1190.1082, "train_tokens_per_second": 712.765 }, { "epoch": 2.0806451612903225, "grad_norm": 0.6955188512802124, "learning_rate": 0.00018029943854453576, "loss": 0.28808388113975525, "num_input_tokens_seen": 851664, "step": 258, "train_runtime": 1194.279, "train_tokens_per_second": 713.12 }, { "epoch": 2.088709677419355, "grad_norm": 0.675642192363739, "learning_rate": 0.00018014757185031671, "loss": 0.302121639251709, "num_input_tokens_seen": 854890, "step": 259, "train_runtime": 1198.113, "train_tokens_per_second": 713.53 }, { "epoch": 2.096774193548387, "grad_norm": 0.6408395171165466, "learning_rate": 0.0001799951865280911, "loss": 0.2811819016933441, "num_input_tokens_seen": 858116, "step": 260, "train_runtime": 1202.0858, "train_tokens_per_second": 713.856 }, { "epoch": 2.1048387096774195, "grad_norm": 0.5794196128845215, "learning_rate": 0.00017984228356393117, "loss": 0.3123016059398651, "num_input_tokens_seen": 861472, "step": 261, "train_runtime": 1206.098, "train_tokens_per_second": 714.264 }, { "epoch": 2.1129032258064515, "grad_norm": 0.5573109984397888, "learning_rate": 0.00017968886394725874, "loss": 0.28222909569740295, "num_input_tokens_seen": 864828, "step": 262, "train_runtime": 1210.1334, "train_tokens_per_second": 714.655 }, { "epoch": 2.120967741935484, "grad_norm": 0.5452139377593994, "learning_rate": 0.00017953492867083895, "loss": 0.340986967086792, "num_input_tokens_seen": 868250, "step": 263, "train_runtime": 1214.227, "train_tokens_per_second": 715.064 }, { "epoch": 2.129032258064516, "grad_norm": 0.5423465967178345, "learning_rate": 0.00017938047873077362, "loss": 0.2829318940639496, "num_input_tokens_seen": 871588, "step": 264, "train_runtime": 1218.2485, "train_tokens_per_second": 715.444 }, { "epoch": 2.1370967741935485, "grad_norm": 0.5712020397186279, "learning_rate": 0.00017922551512649496, "loss": 0.29809385538101196, "num_input_tokens_seen": 874904, "step": 265, "train_runtime": 1222.2601, "train_tokens_per_second": 715.808 }, { "epoch": 2.1451612903225805, "grad_norm": 0.6137015223503113, "learning_rate": 0.00017907003886075904, "loss": 0.28243711590766907, "num_input_tokens_seen": 878270, "step": 266, "train_runtime": 1226.4186, "train_tokens_per_second": 716.126 }, { "epoch": 2.153225806451613, "grad_norm": 0.5994285345077515, "learning_rate": 0.00017891405093963938, "loss": 0.2684274911880493, "num_input_tokens_seen": 881702, "step": 267, "train_runtime": 1230.4755, "train_tokens_per_second": 716.554 }, { "epoch": 2.161290322580645, "grad_norm": 0.5856040120124817, "learning_rate": 0.00017875755237252027, "loss": 0.24499014019966125, "num_input_tokens_seen": 884996, "step": 268, "train_runtime": 1234.4668, "train_tokens_per_second": 716.905 }, { "epoch": 2.1693548387096775, "grad_norm": 0.6271276473999023, "learning_rate": 0.00017860054417209042, "loss": 0.2747422754764557, "num_input_tokens_seen": 888382, "step": 269, "train_runtime": 1238.4919, "train_tokens_per_second": 717.31 }, { "epoch": 2.1774193548387095, "grad_norm": 0.6944452524185181, "learning_rate": 0.00017844302735433635, "loss": 0.28950047492980957, "num_input_tokens_seen": 891756, "step": 270, "train_runtime": 1242.6132, "train_tokens_per_second": 717.646 }, { "epoch": 2.1774193548387095, "eval_loss": 2.4587552547454834, "eval_runtime": 10.3684, "eval_samples_per_second": 5.015, "eval_steps_per_second": 2.508, "num_input_tokens_seen": 891756, "step": 270 }, { "epoch": 2.185483870967742, "grad_norm": 0.7375330924987793, "learning_rate": 0.00017828500293853576, "loss": 0.2858275771141052, "num_input_tokens_seen": 895138, "step": 271, "train_runtime": 1261.0596, "train_tokens_per_second": 709.83 }, { "epoch": 2.193548387096774, "grad_norm": 0.7790223360061646, "learning_rate": 0.00017812647194725094, "loss": 0.26770544052124023, "num_input_tokens_seen": 898256, "step": 272, "train_runtime": 1264.8855, "train_tokens_per_second": 710.148 }, { "epoch": 2.2016129032258065, "grad_norm": 0.7440584301948547, "learning_rate": 0.00017796743540632223, "loss": 0.2846216857433319, "num_input_tokens_seen": 901604, "step": 273, "train_runtime": 1268.9522, "train_tokens_per_second": 710.511 }, { "epoch": 2.2096774193548385, "grad_norm": 0.6914224624633789, "learning_rate": 0.0001778078943448614, "loss": 0.3188799321651459, "num_input_tokens_seen": 905004, "step": 274, "train_runtime": 1273.0703, "train_tokens_per_second": 710.883 }, { "epoch": 2.217741935483871, "grad_norm": 0.6362150311470032, "learning_rate": 0.00017764784979524477, "loss": 0.28756844997406006, "num_input_tokens_seen": 908206, "step": 275, "train_runtime": 1277.0485, "train_tokens_per_second": 711.176 }, { "epoch": 2.225806451612903, "grad_norm": 0.5490068793296814, "learning_rate": 0.00017748730279310685, "loss": 0.25740480422973633, "num_input_tokens_seen": 911584, "step": 276, "train_runtime": 1281.1418, "train_tokens_per_second": 711.54 }, { "epoch": 2.2338709677419355, "grad_norm": 0.6280438303947449, "learning_rate": 0.00017732625437733335, "loss": 0.25927111506462097, "num_input_tokens_seen": 914828, "step": 277, "train_runtime": 1285.0944, "train_tokens_per_second": 711.876 }, { "epoch": 2.241935483870968, "grad_norm": 0.5633710622787476, "learning_rate": 0.00017716470559005473, "loss": 0.2778595983982086, "num_input_tokens_seen": 918148, "step": 278, "train_runtime": 1289.1238, "train_tokens_per_second": 712.226 }, { "epoch": 2.25, "grad_norm": 0.5184245109558105, "learning_rate": 0.0001770026574766391, "loss": 0.25256186723709106, "num_input_tokens_seen": 921458, "step": 279, "train_runtime": 1293.1464, "train_tokens_per_second": 712.571 }, { "epoch": 2.258064516129032, "grad_norm": 0.66209477186203, "learning_rate": 0.00017684011108568592, "loss": 0.31928277015686035, "num_input_tokens_seen": 924876, "step": 280, "train_runtime": 1297.2519, "train_tokens_per_second": 712.95 }, { "epoch": 2.2661290322580645, "grad_norm": 0.730171263217926, "learning_rate": 0.0001766770674690187, "loss": 0.3316803276538849, "num_input_tokens_seen": 928128, "step": 281, "train_runtime": 1301.1326, "train_tokens_per_second": 713.323 }, { "epoch": 2.274193548387097, "grad_norm": 0.5517908930778503, "learning_rate": 0.0001765135276816787, "loss": 0.2707135081291199, "num_input_tokens_seen": 931526, "step": 282, "train_runtime": 1305.2072, "train_tokens_per_second": 713.7 }, { "epoch": 2.282258064516129, "grad_norm": 0.6096500158309937, "learning_rate": 0.0001763494927819177, "loss": 0.2731682062149048, "num_input_tokens_seen": 934904, "step": 283, "train_runtime": 1309.2622, "train_tokens_per_second": 714.069 }, { "epoch": 2.2903225806451615, "grad_norm": 1.5883084535598755, "learning_rate": 0.00017618496383119128, "loss": 0.31949979066848755, "num_input_tokens_seen": 938330, "step": 284, "train_runtime": 1313.3041, "train_tokens_per_second": 714.48 }, { "epoch": 2.2983870967741935, "grad_norm": 0.6766570210456848, "learning_rate": 0.0001760199418941521, "loss": 0.34931379556655884, "num_input_tokens_seen": 941664, "step": 285, "train_runtime": 1317.3288, "train_tokens_per_second": 714.828 }, { "epoch": 2.306451612903226, "grad_norm": 0.5871296525001526, "learning_rate": 0.00017585442803864294, "loss": 0.28577709197998047, "num_input_tokens_seen": 945058, "step": 286, "train_runtime": 1321.3934, "train_tokens_per_second": 715.198 }, { "epoch": 2.314516129032258, "grad_norm": 0.6456281542778015, "learning_rate": 0.00017568842333568952, "loss": 0.29731327295303345, "num_input_tokens_seen": 948198, "step": 287, "train_runtime": 1325.1856, "train_tokens_per_second": 715.521 }, { "epoch": 2.3225806451612905, "grad_norm": 0.5836092829704285, "learning_rate": 0.00017552192885949395, "loss": 0.27603939175605774, "num_input_tokens_seen": 951480, "step": 288, "train_runtime": 1329.191, "train_tokens_per_second": 715.834 }, { "epoch": 2.3306451612903225, "grad_norm": 0.6601560711860657, "learning_rate": 0.0001753549456874276, "loss": 0.28083255887031555, "num_input_tokens_seen": 954742, "step": 289, "train_runtime": 1333.0323, "train_tokens_per_second": 716.218 }, { "epoch": 2.338709677419355, "grad_norm": 0.5849836468696594, "learning_rate": 0.00017518747490002413, "loss": 0.30605417490005493, "num_input_tokens_seen": 958120, "step": 290, "train_runtime": 1337.0733, "train_tokens_per_second": 716.58 }, { "epoch": 2.346774193548387, "grad_norm": 0.7856936454772949, "learning_rate": 0.00017501951758097257, "loss": 0.39218807220458984, "num_input_tokens_seen": 961532, "step": 291, "train_runtime": 1341.1305, "train_tokens_per_second": 716.956 }, { "epoch": 2.3548387096774195, "grad_norm": 0.6249871253967285, "learning_rate": 0.00017485107481711012, "loss": 0.24985340237617493, "num_input_tokens_seen": 964916, "step": 292, "train_runtime": 1345.2083, "train_tokens_per_second": 717.299 }, { "epoch": 2.3629032258064515, "grad_norm": 0.5427806377410889, "learning_rate": 0.0001746821476984154, "loss": 0.2381906509399414, "num_input_tokens_seen": 968250, "step": 293, "train_runtime": 1349.288, "train_tokens_per_second": 717.601 }, { "epoch": 2.370967741935484, "grad_norm": 0.5750318765640259, "learning_rate": 0.00017451273731800115, "loss": 0.2934698462486267, "num_input_tokens_seen": 971434, "step": 294, "train_runtime": 1353.1185, "train_tokens_per_second": 717.922 }, { "epoch": 2.379032258064516, "grad_norm": 0.6818315386772156, "learning_rate": 0.00017434284477210735, "loss": 0.3370276689529419, "num_input_tokens_seen": 974754, "step": 295, "train_runtime": 1357.1478, "train_tokens_per_second": 718.237 }, { "epoch": 2.3870967741935485, "grad_norm": 0.5582734942436218, "learning_rate": 0.00017417247116009388, "loss": 0.2613486349582672, "num_input_tokens_seen": 978032, "step": 296, "train_runtime": 1361.0424, "train_tokens_per_second": 718.59 }, { "epoch": 2.3951612903225805, "grad_norm": 0.6195712089538574, "learning_rate": 0.00017400161758443375, "loss": 0.29393696784973145, "num_input_tokens_seen": 981442, "step": 297, "train_runtime": 1365.1272, "train_tokens_per_second": 718.938 }, { "epoch": 2.403225806451613, "grad_norm": 0.716315507888794, "learning_rate": 0.0001738302851507056, "loss": 0.3132310211658478, "num_input_tokens_seen": 984794, "step": 298, "train_runtime": 1369.1701, "train_tokens_per_second": 719.263 }, { "epoch": 2.411290322580645, "grad_norm": 0.7157399654388428, "learning_rate": 0.00017365847496758684, "loss": 0.30963802337646484, "num_input_tokens_seen": 988146, "step": 299, "train_runtime": 1373.2211, "train_tokens_per_second": 719.583 }, { "epoch": 2.4193548387096775, "grad_norm": 0.5889755487442017, "learning_rate": 0.0001734861881468463, "loss": 0.25868260860443115, "num_input_tokens_seen": 991456, "step": 300, "train_runtime": 1377.248, "train_tokens_per_second": 719.882 }, { "epoch": 2.4193548387096775, "eval_loss": 2.238002061843872, "eval_runtime": 10.3632, "eval_samples_per_second": 5.018, "eval_steps_per_second": 2.509, "num_input_tokens_seen": 991456, "step": 300 }, { "epoch": 2.4274193548387095, "grad_norm": 0.5959520936012268, "learning_rate": 0.00017331342580333706, "loss": 0.2244856059551239, "num_input_tokens_seen": 994790, "step": 301, "train_runtime": 1395.7066, "train_tokens_per_second": 712.75 }, { "epoch": 2.435483870967742, "grad_norm": 0.7241173982620239, "learning_rate": 0.00017314018905498931, "loss": 0.304779589176178, "num_input_tokens_seen": 998138, "step": 302, "train_runtime": 1399.7667, "train_tokens_per_second": 713.075 }, { "epoch": 2.443548387096774, "grad_norm": 0.6994946002960205, "learning_rate": 0.00017296647902280312, "loss": 0.3222126364707947, "num_input_tokens_seen": 1001530, "step": 303, "train_runtime": 1403.8708, "train_tokens_per_second": 713.406 }, { "epoch": 2.4516129032258065, "grad_norm": 0.6702247858047485, "learning_rate": 0.00017279229683084103, "loss": 0.3149133324623108, "num_input_tokens_seen": 1004834, "step": 304, "train_runtime": 1407.8875, "train_tokens_per_second": 713.718 }, { "epoch": 2.4596774193548385, "grad_norm": 0.6507818698883057, "learning_rate": 0.00017261764360622102, "loss": 0.2576565742492676, "num_input_tokens_seen": 1007956, "step": 305, "train_runtime": 1411.7075, "train_tokens_per_second": 713.998 }, { "epoch": 2.467741935483871, "grad_norm": 0.630657970905304, "learning_rate": 0.00017244252047910892, "loss": 0.2836134731769562, "num_input_tokens_seen": 1011316, "step": 306, "train_runtime": 1415.8037, "train_tokens_per_second": 714.305 }, { "epoch": 2.475806451612903, "grad_norm": 0.5873962640762329, "learning_rate": 0.00017226692858271134, "loss": 0.27408260107040405, "num_input_tokens_seen": 1014636, "step": 307, "train_runtime": 1419.8461, "train_tokens_per_second": 714.61 }, { "epoch": 2.4838709677419355, "grad_norm": 0.6181919574737549, "learning_rate": 0.00017209086905326833, "loss": 0.2932998836040497, "num_input_tokens_seen": 1018032, "step": 308, "train_runtime": 1423.9313, "train_tokens_per_second": 714.945 }, { "epoch": 2.491935483870968, "grad_norm": 0.7245927453041077, "learning_rate": 0.0001719143430300458, "loss": 0.28670936822891235, "num_input_tokens_seen": 1021416, "step": 309, "train_runtime": 1428.0038, "train_tokens_per_second": 715.275 }, { "epoch": 2.5, "grad_norm": 0.562571108341217, "learning_rate": 0.00017173735165532846, "loss": 0.29280805587768555, "num_input_tokens_seen": 1024810, "step": 310, "train_runtime": 1432.1081, "train_tokens_per_second": 715.595 }, { "epoch": 2.508064516129032, "grad_norm": 0.6529526114463806, "learning_rate": 0.00017155989607441213, "loss": 0.3507098853588104, "num_input_tokens_seen": 1028170, "step": 311, "train_runtime": 1436.2611, "train_tokens_per_second": 715.866 }, { "epoch": 2.5161290322580645, "grad_norm": 0.6109700202941895, "learning_rate": 0.00017138197743559654, "loss": 0.3289310932159424, "num_input_tokens_seen": 1031370, "step": 312, "train_runtime": 1440.118, "train_tokens_per_second": 716.17 }, { "epoch": 2.524193548387097, "grad_norm": 0.612596333026886, "learning_rate": 0.0001712035968901778, "loss": 0.30889034271240234, "num_input_tokens_seen": 1034628, "step": 313, "train_runtime": 1444.035, "train_tokens_per_second": 716.484 }, { "epoch": 2.532258064516129, "grad_norm": 0.5865165591239929, "learning_rate": 0.00017102475559244105, "loss": 0.24781779944896698, "num_input_tokens_seen": 1037850, "step": 314, "train_runtime": 1447.9152, "train_tokens_per_second": 716.789 }, { "epoch": 2.540322580645161, "grad_norm": 0.6537047028541565, "learning_rate": 0.00017084545469965283, "loss": 0.3204243779182434, "num_input_tokens_seen": 1041268, "step": 315, "train_runtime": 1452.0132, "train_tokens_per_second": 717.12 }, { "epoch": 2.5483870967741935, "grad_norm": 0.6006045341491699, "learning_rate": 0.00017066569537205371, "loss": 0.2686625123023987, "num_input_tokens_seen": 1044620, "step": 316, "train_runtime": 1456.0387, "train_tokens_per_second": 717.44 }, { "epoch": 2.556451612903226, "grad_norm": 0.5934163928031921, "learning_rate": 0.00017048547877285077, "loss": 0.28129106760025024, "num_input_tokens_seen": 1047986, "step": 317, "train_runtime": 1460.0652, "train_tokens_per_second": 717.767 }, { "epoch": 2.564516129032258, "grad_norm": 0.7280172109603882, "learning_rate": 0.00017030480606821, "loss": 0.27025726437568665, "num_input_tokens_seen": 1051334, "step": 318, "train_runtime": 1464.0809, "train_tokens_per_second": 718.085 }, { "epoch": 2.5725806451612905, "grad_norm": 0.6704394817352295, "learning_rate": 0.00017012367842724887, "loss": 0.30058684945106506, "num_input_tokens_seen": 1054698, "step": 319, "train_runtime": 1468.1115, "train_tokens_per_second": 718.405 }, { "epoch": 2.5806451612903225, "grad_norm": 0.6334468126296997, "learning_rate": 0.00016994209702202867, "loss": 0.31510958075523376, "num_input_tokens_seen": 1058022, "step": 320, "train_runtime": 1472.1413, "train_tokens_per_second": 718.696 }, { "epoch": 2.588709677419355, "grad_norm": 0.7871665954589844, "learning_rate": 0.00016976006302754702, "loss": 0.3096451163291931, "num_input_tokens_seen": 1061014, "step": 321, "train_runtime": 1475.7415, "train_tokens_per_second": 718.97 }, { "epoch": 2.596774193548387, "grad_norm": 0.636999249458313, "learning_rate": 0.0001695775776217301, "loss": 0.2508867383003235, "num_input_tokens_seen": 1064402, "step": 322, "train_runtime": 1479.7786, "train_tokens_per_second": 719.298 }, { "epoch": 2.6048387096774195, "grad_norm": 0.616060733795166, "learning_rate": 0.00016939464198542523, "loss": 0.2926797866821289, "num_input_tokens_seen": 1067606, "step": 323, "train_runtime": 1483.6203, "train_tokens_per_second": 719.595 }, { "epoch": 2.6129032258064515, "grad_norm": 0.6996451616287231, "learning_rate": 0.00016921125730239307, "loss": 0.33650434017181396, "num_input_tokens_seen": 1070900, "step": 324, "train_runtime": 1487.5405, "train_tokens_per_second": 719.913 }, { "epoch": 2.620967741935484, "grad_norm": 0.5791569352149963, "learning_rate": 0.00016902742475930006, "loss": 0.25794506072998047, "num_input_tokens_seen": 1074256, "step": 325, "train_runtime": 1491.5897, "train_tokens_per_second": 720.209 }, { "epoch": 2.629032258064516, "grad_norm": 0.6315230131149292, "learning_rate": 0.00016884314554571064, "loss": 0.2916939854621887, "num_input_tokens_seen": 1077650, "step": 326, "train_runtime": 1495.6384, "train_tokens_per_second": 720.528 }, { "epoch": 2.6370967741935485, "grad_norm": 0.702532172203064, "learning_rate": 0.0001686584208540797, "loss": 0.31128430366516113, "num_input_tokens_seen": 1080768, "step": 327, "train_runtime": 1499.4368, "train_tokens_per_second": 720.783 }, { "epoch": 2.6451612903225805, "grad_norm": 0.680218517780304, "learning_rate": 0.00016847325187974477, "loss": 0.2789115309715271, "num_input_tokens_seen": 1083952, "step": 328, "train_runtime": 1503.275, "train_tokens_per_second": 721.06 }, { "epoch": 2.653225806451613, "grad_norm": 0.7413024306297302, "learning_rate": 0.00016828763982091826, "loss": 0.3262504041194916, "num_input_tokens_seen": 1087238, "step": 329, "train_runtime": 1507.2815, "train_tokens_per_second": 721.324 }, { "epoch": 2.661290322580645, "grad_norm": 0.6193671822547913, "learning_rate": 0.00016810158587867973, "loss": 0.2687387764453888, "num_input_tokens_seen": 1090512, "step": 330, "train_runtime": 1511.2968, "train_tokens_per_second": 721.574 }, { "epoch": 2.661290322580645, "eval_loss": 2.222731590270996, "eval_runtime": 10.3677, "eval_samples_per_second": 5.016, "eval_steps_per_second": 2.508, "num_input_tokens_seen": 1090512, "step": 330 }, { "epoch": 2.6693548387096775, "grad_norm": 0.6186102032661438, "learning_rate": 0.00016791509125696816, "loss": 0.3041277527809143, "num_input_tokens_seen": 1093854, "step": 331, "train_runtime": 1529.7169, "train_tokens_per_second": 715.07 }, { "epoch": 2.6774193548387095, "grad_norm": 0.6018481850624084, "learning_rate": 0.00016772815716257412, "loss": 0.2938111126422882, "num_input_tokens_seen": 1097220, "step": 332, "train_runtime": 1533.7479, "train_tokens_per_second": 715.385 }, { "epoch": 2.685483870967742, "grad_norm": 0.6231778860092163, "learning_rate": 0.00016754078480513197, "loss": 0.31392157077789307, "num_input_tokens_seen": 1100558, "step": 333, "train_runtime": 1537.8319, "train_tokens_per_second": 715.656 }, { "epoch": 2.693548387096774, "grad_norm": 0.6355130076408386, "learning_rate": 0.00016735297539711204, "loss": 0.2564769983291626, "num_input_tokens_seen": 1103750, "step": 334, "train_runtime": 1541.6907, "train_tokens_per_second": 715.935 }, { "epoch": 2.7016129032258065, "grad_norm": 0.7109224796295166, "learning_rate": 0.00016716473015381276, "loss": 0.3542025685310364, "num_input_tokens_seen": 1107152, "step": 335, "train_runtime": 1545.8031, "train_tokens_per_second": 716.231 }, { "epoch": 2.709677419354839, "grad_norm": 0.6006855964660645, "learning_rate": 0.0001669760502933528, "loss": 0.3422238528728485, "num_input_tokens_seen": 1110494, "step": 336, "train_runtime": 1549.8692, "train_tokens_per_second": 716.508 }, { "epoch": 2.717741935483871, "grad_norm": 0.6025466322898865, "learning_rate": 0.00016678693703666325, "loss": 0.2790500223636627, "num_input_tokens_seen": 1113802, "step": 337, "train_runtime": 1553.9962, "train_tokens_per_second": 716.734 }, { "epoch": 2.725806451612903, "grad_norm": 0.6767451167106628, "learning_rate": 0.00016659739160747967, "loss": 0.3348698019981384, "num_input_tokens_seen": 1117078, "step": 338, "train_runtime": 1558.0054, "train_tokens_per_second": 716.992 }, { "epoch": 2.7338709677419355, "grad_norm": 0.7819514274597168, "learning_rate": 0.00016640741523233407, "loss": 0.33901745080947876, "num_input_tokens_seen": 1120186, "step": 339, "train_runtime": 1561.7798, "train_tokens_per_second": 717.25 }, { "epoch": 2.741935483870968, "grad_norm": 0.6177900433540344, "learning_rate": 0.00016621700914054718, "loss": 0.3110594153404236, "num_input_tokens_seen": 1123568, "step": 340, "train_runtime": 1565.8546, "train_tokens_per_second": 717.543 }, { "epoch": 2.75, "grad_norm": 0.6533312797546387, "learning_rate": 0.00016602617456422034, "loss": 0.2856423258781433, "num_input_tokens_seen": 1126926, "step": 341, "train_runtime": 1569.9333, "train_tokens_per_second": 717.818 }, { "epoch": 2.758064516129032, "grad_norm": 0.6791844964027405, "learning_rate": 0.00016583491273822765, "loss": 0.32431840896606445, "num_input_tokens_seen": 1130262, "step": 342, "train_runtime": 1573.9778, "train_tokens_per_second": 718.093 }, { "epoch": 2.7661290322580645, "grad_norm": 0.7280325889587402, "learning_rate": 0.00016564322490020776, "loss": 0.3399568498134613, "num_input_tokens_seen": 1133612, "step": 343, "train_runtime": 1578.0114, "train_tokens_per_second": 718.38 }, { "epoch": 2.774193548387097, "grad_norm": 1.0895047187805176, "learning_rate": 0.00016545111229055614, "loss": 0.336514949798584, "num_input_tokens_seen": 1136944, "step": 344, "train_runtime": 1582.0241, "train_tokens_per_second": 718.664 }, { "epoch": 2.782258064516129, "grad_norm": 0.7008950710296631, "learning_rate": 0.00016525857615241687, "loss": 0.35175198316574097, "num_input_tokens_seen": 1140396, "step": 345, "train_runtime": 1586.0972, "train_tokens_per_second": 718.995 }, { "epoch": 2.790322580645161, "grad_norm": 0.579539954662323, "learning_rate": 0.00016506561773167464, "loss": 0.29808709025382996, "num_input_tokens_seen": 1143862, "step": 346, "train_runtime": 1590.3586, "train_tokens_per_second": 719.248 }, { "epoch": 2.7983870967741935, "grad_norm": 0.6034091711044312, "learning_rate": 0.00016487223827694672, "loss": 0.3193822503089905, "num_input_tokens_seen": 1147244, "step": 347, "train_runtime": 1594.4136, "train_tokens_per_second": 719.54 }, { "epoch": 2.806451612903226, "grad_norm": 0.62319415807724, "learning_rate": 0.00016467843903957485, "loss": 0.28525224328041077, "num_input_tokens_seen": 1150452, "step": 348, "train_runtime": 1598.2467, "train_tokens_per_second": 719.821 }, { "epoch": 2.814516129032258, "grad_norm": 0.6607264280319214, "learning_rate": 0.00016448422127361706, "loss": 0.2740340828895569, "num_input_tokens_seen": 1153642, "step": 349, "train_runtime": 1602.068, "train_tokens_per_second": 720.096 }, { "epoch": 2.8225806451612905, "grad_norm": 0.7429677844047546, "learning_rate": 0.00016428958623583982, "loss": 0.28102120757102966, "num_input_tokens_seen": 1156888, "step": 350, "train_runtime": 1606.0395, "train_tokens_per_second": 720.336 }, { "epoch": 2.8306451612903225, "grad_norm": 0.6027553677558899, "learning_rate": 0.0001640945351857096, "loss": 0.2649904489517212, "num_input_tokens_seen": 1160198, "step": 351, "train_runtime": 1610.0476, "train_tokens_per_second": 720.599 }, { "epoch": 2.838709677419355, "grad_norm": 0.6074771285057068, "learning_rate": 0.0001638990693853848, "loss": 0.2550981938838959, "num_input_tokens_seen": 1163444, "step": 352, "train_runtime": 1614.0033, "train_tokens_per_second": 720.844 }, { "epoch": 2.846774193548387, "grad_norm": 0.7516021132469177, "learning_rate": 0.00016370319009970777, "loss": 0.27090102434158325, "num_input_tokens_seen": 1166582, "step": 353, "train_runtime": 1617.7897, "train_tokens_per_second": 721.096 }, { "epoch": 2.8548387096774195, "grad_norm": 0.608556866645813, "learning_rate": 0.0001635068985961965, "loss": 0.2796551585197449, "num_input_tokens_seen": 1169882, "step": 354, "train_runtime": 1621.7832, "train_tokens_per_second": 721.355 }, { "epoch": 2.8629032258064515, "grad_norm": 0.659274697303772, "learning_rate": 0.00016331019614503623, "loss": 0.34103769063949585, "num_input_tokens_seen": 1173312, "step": 355, "train_runtime": 1625.8724, "train_tokens_per_second": 721.651 }, { "epoch": 2.870967741935484, "grad_norm": 0.6731792688369751, "learning_rate": 0.00016311308401907153, "loss": 0.32820090651512146, "num_input_tokens_seen": 1176774, "step": 356, "train_runtime": 1630.0017, "train_tokens_per_second": 721.947 }, { "epoch": 2.879032258064516, "grad_norm": 0.6033483743667603, "learning_rate": 0.00016291556349379795, "loss": 0.28098687529563904, "num_input_tokens_seen": 1180230, "step": 357, "train_runtime": 1634.1041, "train_tokens_per_second": 722.249 }, { "epoch": 2.8870967741935485, "grad_norm": 0.6251856088638306, "learning_rate": 0.0001627176358473537, "loss": 0.3627236485481262, "num_input_tokens_seen": 1183606, "step": 358, "train_runtime": 1638.1672, "train_tokens_per_second": 722.518 }, { "epoch": 2.8951612903225805, "grad_norm": 0.64890456199646, "learning_rate": 0.0001625193023605115, "loss": 0.2701757252216339, "num_input_tokens_seen": 1186796, "step": 359, "train_runtime": 1642.0043, "train_tokens_per_second": 722.773 }, { "epoch": 2.903225806451613, "grad_norm": 0.6447115540504456, "learning_rate": 0.00016232056431667017, "loss": 0.3284357786178589, "num_input_tokens_seen": 1190008, "step": 360, "train_runtime": 1645.8682, "train_tokens_per_second": 723.028 }, { "epoch": 2.903225806451613, "eval_loss": 2.13553524017334, "eval_runtime": 10.4034, "eval_samples_per_second": 4.998, "eval_steps_per_second": 2.499, "num_input_tokens_seen": 1190008, "step": 360 }, { "epoch": 2.911290322580645, "grad_norm": 0.5953078866004944, "learning_rate": 0.0001621214230018464, "loss": 0.307175874710083, "num_input_tokens_seen": 1193388, "step": 361, "train_runtime": 1664.3092, "train_tokens_per_second": 717.047 }, { "epoch": 2.9193548387096775, "grad_norm": 0.6266926527023315, "learning_rate": 0.00016192187970466644, "loss": 0.27196913957595825, "num_input_tokens_seen": 1196530, "step": 362, "train_runtime": 1668.1264, "train_tokens_per_second": 717.29 }, { "epoch": 2.9274193548387095, "grad_norm": 0.5678778886795044, "learning_rate": 0.00016172193571635767, "loss": 0.2596748471260071, "num_input_tokens_seen": 1199850, "step": 363, "train_runtime": 1672.1956, "train_tokens_per_second": 717.53 }, { "epoch": 2.935483870967742, "grad_norm": 0.7649815082550049, "learning_rate": 0.00016152159233074037, "loss": 0.3512059450149536, "num_input_tokens_seen": 1203098, "step": 364, "train_runtime": 1676.1191, "train_tokens_per_second": 717.788 }, { "epoch": 2.943548387096774, "grad_norm": 0.6907058954238892, "learning_rate": 0.0001613208508442193, "loss": 0.3612962067127228, "num_input_tokens_seen": 1206496, "step": 365, "train_runtime": 1680.2229, "train_tokens_per_second": 718.057 }, { "epoch": 2.9516129032258065, "grad_norm": 0.5369240045547485, "learning_rate": 0.0001611197125557752, "loss": 0.2572643756866455, "num_input_tokens_seen": 1209652, "step": 366, "train_runtime": 1684.0732, "train_tokens_per_second": 718.289 }, { "epoch": 2.959677419354839, "grad_norm": 0.5834548473358154, "learning_rate": 0.00016091817876695655, "loss": 0.27215975522994995, "num_input_tokens_seen": 1212982, "step": 367, "train_runtime": 1688.117, "train_tokens_per_second": 718.541 }, { "epoch": 2.967741935483871, "grad_norm": 0.6674952507019043, "learning_rate": 0.00016071625078187114, "loss": 0.2797396779060364, "num_input_tokens_seen": 1216172, "step": 368, "train_runtime": 1691.981, "train_tokens_per_second": 718.786 }, { "epoch": 2.975806451612903, "grad_norm": 0.6768476366996765, "learning_rate": 0.0001605139299071774, "loss": 0.3542667329311371, "num_input_tokens_seen": 1219592, "step": 369, "train_runtime": 1696.1081, "train_tokens_per_second": 719.053 }, { "epoch": 2.9838709677419355, "grad_norm": 0.7091079354286194, "learning_rate": 0.00016031121745207626, "loss": 0.2957330048084259, "num_input_tokens_seen": 1222706, "step": 370, "train_runtime": 1699.921, "train_tokens_per_second": 719.272 }, { "epoch": 2.991935483870968, "grad_norm": 0.7082553505897522, "learning_rate": 0.00016010811472830252, "loss": 0.2873592972755432, "num_input_tokens_seen": 1225804, "step": 371, "train_runtime": 1703.6255, "train_tokens_per_second": 719.527 }, { "epoch": 3.0, "grad_norm": 0.5840099453926086, "learning_rate": 0.0001599046230501163, "loss": 0.26493021845817566, "num_input_tokens_seen": 1229126, "step": 372, "train_runtime": 1707.6656, "train_tokens_per_second": 719.77 }, { "epoch": 3.0080645161290325, "grad_norm": 0.5550098419189453, "learning_rate": 0.00015970074373429464, "loss": 0.1822848916053772, "num_input_tokens_seen": 1232466, "step": 373, "train_runtime": 1711.6892, "train_tokens_per_second": 720.029 }, { "epoch": 3.0161290322580645, "grad_norm": 0.5182167291641235, "learning_rate": 0.00015949647810012301, "loss": 0.18206968903541565, "num_input_tokens_seen": 1235888, "step": 374, "train_runtime": 1715.7992, "train_tokens_per_second": 720.299 }, { "epoch": 3.024193548387097, "grad_norm": 0.48984354734420776, "learning_rate": 0.0001592918274693866, "loss": 0.16470718383789062, "num_input_tokens_seen": 1239272, "step": 375, "train_runtime": 1719.8357, "train_tokens_per_second": 720.576 }, { "epoch": 3.032258064516129, "grad_norm": 0.528260350227356, "learning_rate": 0.000159086793166362, "loss": 0.1563296616077423, "num_input_tokens_seen": 1242570, "step": 376, "train_runtime": 1723.8445, "train_tokens_per_second": 720.813 }, { "epoch": 3.0403225806451615, "grad_norm": 0.5711667537689209, "learning_rate": 0.00015888137651780845, "loss": 0.1582183986902237, "num_input_tokens_seen": 1245794, "step": 377, "train_runtime": 1727.6991, "train_tokens_per_second": 721.071 }, { "epoch": 3.0483870967741935, "grad_norm": 0.558810293674469, "learning_rate": 0.0001586755788529593, "loss": 0.14666521549224854, "num_input_tokens_seen": 1249196, "step": 378, "train_runtime": 1731.7597, "train_tokens_per_second": 721.345 }, { "epoch": 3.056451612903226, "grad_norm": 0.8101882338523865, "learning_rate": 0.00015846940150351344, "loss": 0.18561005592346191, "num_input_tokens_seen": 1252432, "step": 379, "train_runtime": 1735.6067, "train_tokens_per_second": 721.61 }, { "epoch": 3.064516129032258, "grad_norm": 0.7118780612945557, "learning_rate": 0.00015826284580362668, "loss": 0.1576988697052002, "num_input_tokens_seen": 1255716, "step": 380, "train_runtime": 1739.5733, "train_tokens_per_second": 721.853 }, { "epoch": 3.0725806451612905, "grad_norm": 0.8138672113418579, "learning_rate": 0.00015805591308990308, "loss": 0.18212170898914337, "num_input_tokens_seen": 1259090, "step": 381, "train_runtime": 1743.649, "train_tokens_per_second": 722.101 }, { "epoch": 3.0806451612903225, "grad_norm": 0.7878643274307251, "learning_rate": 0.00015784860470138633, "loss": 0.18663600087165833, "num_input_tokens_seen": 1262366, "step": 382, "train_runtime": 1747.5729, "train_tokens_per_second": 722.354 }, { "epoch": 3.088709677419355, "grad_norm": 0.7864635586738586, "learning_rate": 0.00015764092197955112, "loss": 0.18056976795196533, "num_input_tokens_seen": 1265824, "step": 383, "train_runtime": 1751.6686, "train_tokens_per_second": 722.639 }, { "epoch": 3.096774193548387, "grad_norm": 0.7926681637763977, "learning_rate": 0.00015743286626829437, "loss": 0.12878452241420746, "num_input_tokens_seen": 1269094, "step": 384, "train_runtime": 1755.6443, "train_tokens_per_second": 722.865 }, { "epoch": 3.1048387096774195, "grad_norm": 0.7887319922447205, "learning_rate": 0.00015722443891392658, "loss": 0.18509285151958466, "num_input_tokens_seen": 1272194, "step": 385, "train_runtime": 1759.477, "train_tokens_per_second": 723.052 }, { "epoch": 3.1129032258064515, "grad_norm": 0.6997226476669312, "learning_rate": 0.00015701564126516314, "loss": 0.16511765122413635, "num_input_tokens_seen": 1275400, "step": 386, "train_runtime": 1763.3141, "train_tokens_per_second": 723.297 }, { "epoch": 3.120967741935484, "grad_norm": 0.8127301931381226, "learning_rate": 0.00015680647467311557, "loss": 0.1577507108449936, "num_input_tokens_seen": 1278726, "step": 387, "train_runtime": 1767.3065, "train_tokens_per_second": 723.545 }, { "epoch": 3.129032258064516, "grad_norm": 0.7188698053359985, "learning_rate": 0.00015659694049128286, "loss": 0.19617129862308502, "num_input_tokens_seen": 1281930, "step": 388, "train_runtime": 1771.1468, "train_tokens_per_second": 723.785 }, { "epoch": 3.1370967741935485, "grad_norm": 0.4875478744506836, "learning_rate": 0.0001563870400755425, "loss": 0.11804057657718658, "num_input_tokens_seen": 1285208, "step": 389, "train_runtime": 1775.1314, "train_tokens_per_second": 724.007 }, { "epoch": 3.1451612903225805, "grad_norm": 0.7870502471923828, "learning_rate": 0.00015617677478414196, "loss": 0.17701417207717896, "num_input_tokens_seen": 1288420, "step": 390, "train_runtime": 1778.9788, "train_tokens_per_second": 724.247 }, { "epoch": 3.1451612903225805, "eval_loss": 2.6576552391052246, "eval_runtime": 10.2889, "eval_samples_per_second": 5.054, "eval_steps_per_second": 2.527, "num_input_tokens_seen": 1288420, "step": 390 } ], "logging_steps": 1, "max_steps": 1240, "num_input_tokens_seen": 1288420, "num_train_epochs": 10, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.929439261884288e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }