{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5000, "global_step": 965749, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001035465736956497, "grad_norm": 5.295963764190674, "learning_rate": 9.92e-06, "loss": 8.4977, "step": 1000 }, { "epoch": 0.002070931473912994, "grad_norm": 2.1808838844299316, "learning_rate": 1.992e-05, "loss": 5.7562, "step": 2000 }, { "epoch": 0.003106397210869491, "grad_norm": 2.5940399169921875, "learning_rate": 2.9920000000000005e-05, "loss": 5.1705, "step": 3000 }, { "epoch": 0.004141862947825988, "grad_norm": 2.5618865489959717, "learning_rate": 3.9920000000000004e-05, "loss": 4.7383, "step": 4000 }, { "epoch": 0.005177328684782485, "grad_norm": 2.15096116065979, "learning_rate": 4.992e-05, "loss": 4.4163, "step": 5000 }, { "epoch": 0.005177328684782485, "eval_loss": 4.266116142272949, "eval_runtime": 16.5978, "eval_samples_per_second": 2979.738, "eval_steps_per_second": 11.688, "step": 5000 }, { "epoch": 0.006212794421738982, "grad_norm": 2.0443923473358154, "learning_rate": 5e-05, "loss": 4.1981, "step": 6000 }, { "epoch": 0.007248260158695479, "grad_norm": 2.379178524017334, "learning_rate": 5e-05, "loss": 3.9849, "step": 7000 }, { "epoch": 0.008283725895651977, "grad_norm": 2.9000627994537354, "learning_rate": 5e-05, "loss": 3.856, "step": 8000 }, { "epoch": 0.009319191632608474, "grad_norm": 2.694687604904175, "learning_rate": 5e-05, "loss": 3.759, "step": 9000 }, { "epoch": 0.01035465736956497, "grad_norm": 1.9065606594085693, "learning_rate": 5e-05, "loss": 3.6538, "step": 10000 }, { "epoch": 0.01035465736956497, "eval_loss": 3.5857133865356445, "eval_runtime": 17.7332, "eval_samples_per_second": 2788.951, "eval_steps_per_second": 10.94, "step": 10000 }, { "epoch": 0.011390123106521467, "grad_norm": 1.7840981483459473, "learning_rate": 5e-05, "loss": 3.5709, "step": 11000 }, { "epoch": 0.012425588843477964, "grad_norm": 2.0488035678863525, "learning_rate": 5e-05, "loss": 3.5079, "step": 12000 }, { "epoch": 0.013461054580434461, "grad_norm": 1.8077210187911987, "learning_rate": 5e-05, "loss": 3.4421, "step": 13000 }, { "epoch": 0.014496520317390958, "grad_norm": 1.925595998764038, "learning_rate": 5e-05, "loss": 3.3707, "step": 14000 }, { "epoch": 0.015531986054347455, "grad_norm": 2.2106831073760986, "learning_rate": 5e-05, "loss": 3.3201, "step": 15000 }, { "epoch": 0.015531986054347455, "eval_loss": 3.2835636138916016, "eval_runtime": 24.2836, "eval_samples_per_second": 2036.641, "eval_steps_per_second": 7.989, "step": 15000 }, { "epoch": 0.016567451791303953, "grad_norm": 2.162048578262329, "learning_rate": 5e-05, "loss": 3.2932, "step": 16000 }, { "epoch": 0.01760291752826045, "grad_norm": 2.2526283264160156, "learning_rate": 5e-05, "loss": 3.2308, "step": 17000 }, { "epoch": 0.018638383265216947, "grad_norm": 1.7822258472442627, "learning_rate": 5e-05, "loss": 3.2012, "step": 18000 }, { "epoch": 0.019673849002173442, "grad_norm": 2.0987908840179443, "learning_rate": 5e-05, "loss": 3.1437, "step": 19000 }, { "epoch": 0.02070931473912994, "grad_norm": 1.7948940992355347, "learning_rate": 5e-05, "loss": 3.1402, "step": 20000 }, { "epoch": 0.02070931473912994, "eval_loss": 3.0880234241485596, "eval_runtime": 16.8033, "eval_samples_per_second": 2943.298, "eval_steps_per_second": 11.545, "step": 20000 }, { "epoch": 0.021744780476086436, "grad_norm": 1.7310450077056885, "learning_rate": 5e-05, "loss": 3.0985, "step": 21000 }, { "epoch": 0.022780246213042935, "grad_norm": 1.727274775505066, "learning_rate": 5e-05, "loss": 3.0633, "step": 22000 }, { "epoch": 0.02381571194999943, "grad_norm": 1.6332778930664062, "learning_rate": 5e-05, "loss": 3.0373, "step": 23000 }, { "epoch": 0.024851177686955928, "grad_norm": 1.7220966815948486, "learning_rate": 5e-05, "loss": 3.0169, "step": 24000 }, { "epoch": 0.025886643423912423, "grad_norm": 2.0234944820404053, "learning_rate": 5e-05, "loss": 2.9761, "step": 25000 }, { "epoch": 0.025886643423912423, "eval_loss": 2.950173854827881, "eval_runtime": 20.4691, "eval_samples_per_second": 2416.183, "eval_steps_per_second": 9.478, "step": 25000 }, { "epoch": 0.026922109160868922, "grad_norm": 1.9352872371673584, "learning_rate": 5e-05, "loss": 2.9704, "step": 26000 }, { "epoch": 0.027957574897825417, "grad_norm": 2.0853705406188965, "learning_rate": 5e-05, "loss": 2.9426, "step": 27000 }, { "epoch": 0.028993040634781916, "grad_norm": 1.5806540250778198, "learning_rate": 5e-05, "loss": 2.9182, "step": 28000 }, { "epoch": 0.03002850637173841, "grad_norm": 1.729049801826477, "learning_rate": 5e-05, "loss": 2.8912, "step": 29000 }, { "epoch": 0.03106397210869491, "grad_norm": 2.1811740398406982, "learning_rate": 5e-05, "loss": 2.8908, "step": 30000 }, { "epoch": 0.03106397210869491, "eval_loss": 2.8682405948638916, "eval_runtime": 16.5574, "eval_samples_per_second": 2987.007, "eval_steps_per_second": 11.717, "step": 30000 }, { "epoch": 0.032099437845651405, "grad_norm": 2.0321109294891357, "learning_rate": 5e-05, "loss": 2.8732, "step": 31000 }, { "epoch": 0.03313490358260791, "grad_norm": 2.065203905105591, "learning_rate": 5e-05, "loss": 2.8576, "step": 32000 }, { "epoch": 0.0341703693195644, "grad_norm": 1.7852224111557007, "learning_rate": 5e-05, "loss": 2.8378, "step": 33000 }, { "epoch": 0.0352058350565209, "grad_norm": 1.6953657865524292, "learning_rate": 5e-05, "loss": 2.8113, "step": 34000 }, { "epoch": 0.03624130079347739, "grad_norm": 1.9876961708068848, "learning_rate": 5e-05, "loss": 2.8135, "step": 35000 }, { "epoch": 0.03624130079347739, "eval_loss": 2.7857329845428467, "eval_runtime": 20.7337, "eval_samples_per_second": 2385.344, "eval_steps_per_second": 9.357, "step": 35000 }, { "epoch": 0.037276766530433894, "grad_norm": 2.0485334396362305, "learning_rate": 5e-05, "loss": 2.8003, "step": 36000 }, { "epoch": 0.03831223226739039, "grad_norm": 2.228898048400879, "learning_rate": 5e-05, "loss": 2.7635, "step": 37000 }, { "epoch": 0.039347698004346884, "grad_norm": 2.039750337600708, "learning_rate": 5e-05, "loss": 2.7655, "step": 38000 }, { "epoch": 0.04038316374130338, "grad_norm": 1.6063960790634155, "learning_rate": 5e-05, "loss": 2.7665, "step": 39000 }, { "epoch": 0.04141862947825988, "grad_norm": 1.679189920425415, "learning_rate": 5e-05, "loss": 2.7416, "step": 40000 }, { "epoch": 0.04141862947825988, "eval_loss": 2.7217838764190674, "eval_runtime": 17.7772, "eval_samples_per_second": 2782.047, "eval_steps_per_second": 10.913, "step": 40000 }, { "epoch": 0.04245409521521638, "grad_norm": 2.0102996826171875, "learning_rate": 5e-05, "loss": 2.7269, "step": 41000 }, { "epoch": 0.04348956095217287, "grad_norm": 1.8129605054855347, "learning_rate": 5e-05, "loss": 2.7308, "step": 42000 }, { "epoch": 0.04452502668912937, "grad_norm": 1.7243841886520386, "learning_rate": 5e-05, "loss": 2.6989, "step": 43000 }, { "epoch": 0.04556049242608587, "grad_norm": 1.8720917701721191, "learning_rate": 5e-05, "loss": 2.7039, "step": 44000 }, { "epoch": 0.046595958163042364, "grad_norm": 1.686995506286621, "learning_rate": 5e-05, "loss": 2.6911, "step": 45000 }, { "epoch": 0.046595958163042364, "eval_loss": 2.669776678085327, "eval_runtime": 24.3059, "eval_samples_per_second": 2034.773, "eval_steps_per_second": 7.982, "step": 45000 }, { "epoch": 0.04763142389999886, "grad_norm": 1.7721649408340454, "learning_rate": 5e-05, "loss": 2.6801, "step": 46000 }, { "epoch": 0.04866688963695536, "grad_norm": 1.7684073448181152, "learning_rate": 5e-05, "loss": 2.6667, "step": 47000 }, { "epoch": 0.049702355373911856, "grad_norm": 1.6600712537765503, "learning_rate": 5e-05, "loss": 2.6588, "step": 48000 }, { "epoch": 0.05073782111086835, "grad_norm": 1.6547819375991821, "learning_rate": 5e-05, "loss": 2.6545, "step": 49000 }, { "epoch": 0.05177328684782485, "grad_norm": 1.6996227502822876, "learning_rate": 5e-05, "loss": 2.6362, "step": 50000 }, { "epoch": 0.05177328684782485, "eval_loss": 2.6260855197906494, "eval_runtime": 18.6918, "eval_samples_per_second": 2645.921, "eval_steps_per_second": 10.379, "step": 50000 }, { "epoch": 0.05280875258478135, "grad_norm": 2.2706496715545654, "learning_rate": 5e-05, "loss": 2.63, "step": 51000 }, { "epoch": 0.053844218321737844, "grad_norm": 1.6703062057495117, "learning_rate": 5e-05, "loss": 2.6243, "step": 52000 }, { "epoch": 0.05487968405869434, "grad_norm": 1.6016428470611572, "learning_rate": 5e-05, "loss": 2.6266, "step": 53000 }, { "epoch": 0.055915149795650834, "grad_norm": 1.567052960395813, "learning_rate": 5e-05, "loss": 2.6115, "step": 54000 }, { "epoch": 0.056950615532607336, "grad_norm": 1.7554875612258911, "learning_rate": 5e-05, "loss": 2.6094, "step": 55000 }, { "epoch": 0.056950615532607336, "eval_loss": 2.5904622077941895, "eval_runtime": 24.6243, "eval_samples_per_second": 2008.46, "eval_steps_per_second": 7.878, "step": 55000 }, { "epoch": 0.05798608126956383, "grad_norm": 2.1339058876037598, "learning_rate": 5e-05, "loss": 2.6088, "step": 56000 }, { "epoch": 0.05902154700652033, "grad_norm": 2.116978406906128, "learning_rate": 5e-05, "loss": 2.5887, "step": 57000 }, { "epoch": 0.06005701274347682, "grad_norm": 1.9898152351379395, "learning_rate": 5e-05, "loss": 2.586, "step": 58000 }, { "epoch": 0.061092478480433324, "grad_norm": 1.9398534297943115, "learning_rate": 5e-05, "loss": 2.5883, "step": 59000 }, { "epoch": 0.06212794421738982, "grad_norm": 1.707964301109314, "learning_rate": 5e-05, "loss": 2.5718, "step": 60000 }, { "epoch": 0.06212794421738982, "eval_loss": 2.546797513961792, "eval_runtime": 17.6579, "eval_samples_per_second": 2800.839, "eval_steps_per_second": 10.987, "step": 60000 }, { "epoch": 0.06316340995434631, "grad_norm": 1.5731209516525269, "learning_rate": 5e-05, "loss": 2.5716, "step": 61000 }, { "epoch": 0.06419887569130281, "grad_norm": 2.1644086837768555, "learning_rate": 5e-05, "loss": 2.5653, "step": 62000 }, { "epoch": 0.0652343414282593, "grad_norm": 1.5315316915512085, "learning_rate": 5e-05, "loss": 2.5352, "step": 63000 }, { "epoch": 0.06626980716521581, "grad_norm": 1.4464577436447144, "learning_rate": 5e-05, "loss": 2.5527, "step": 64000 }, { "epoch": 0.06730527290217231, "grad_norm": 1.8106459379196167, "learning_rate": 5e-05, "loss": 2.5368, "step": 65000 }, { "epoch": 0.06730527290217231, "eval_loss": 2.5234298706054688, "eval_runtime": 20.0466, "eval_samples_per_second": 2467.096, "eval_steps_per_second": 9.677, "step": 65000 }, { "epoch": 0.0683407386391288, "grad_norm": 1.8276619911193848, "learning_rate": 5e-05, "loss": 2.5312, "step": 66000 }, { "epoch": 0.0693762043760853, "grad_norm": 1.4809328317642212, "learning_rate": 5e-05, "loss": 2.5363, "step": 67000 }, { "epoch": 0.0704116701130418, "grad_norm": 1.711206316947937, "learning_rate": 5e-05, "loss": 2.527, "step": 68000 }, { "epoch": 0.07144713584999829, "grad_norm": 1.7342501878738403, "learning_rate": 5e-05, "loss": 2.5039, "step": 69000 }, { "epoch": 0.07248260158695478, "grad_norm": 1.5501240491867065, "learning_rate": 5e-05, "loss": 2.5066, "step": 70000 }, { "epoch": 0.07248260158695478, "eval_loss": 2.489656448364258, "eval_runtime": 17.4023, "eval_samples_per_second": 2841.977, "eval_steps_per_second": 11.148, "step": 70000 }, { "epoch": 0.07351806732391128, "grad_norm": 1.970528483390808, "learning_rate": 5e-05, "loss": 2.5052, "step": 71000 }, { "epoch": 0.07455353306086779, "grad_norm": 1.6995656490325928, "learning_rate": 5e-05, "loss": 2.4875, "step": 72000 }, { "epoch": 0.07558899879782428, "grad_norm": 1.906000018119812, "learning_rate": 5e-05, "loss": 2.4934, "step": 73000 }, { "epoch": 0.07662446453478078, "grad_norm": 1.8074511289596558, "learning_rate": 5e-05, "loss": 2.494, "step": 74000 }, { "epoch": 0.07765993027173727, "grad_norm": 1.6471534967422485, "learning_rate": 5e-05, "loss": 2.4962, "step": 75000 }, { "epoch": 0.07765993027173727, "eval_loss": 2.462290048599243, "eval_runtime": 31.7738, "eval_samples_per_second": 1556.535, "eval_steps_per_second": 6.106, "step": 75000 }, { "epoch": 0.07869539600869377, "grad_norm": 1.7024720907211304, "learning_rate": 5e-05, "loss": 2.4639, "step": 76000 }, { "epoch": 0.07973086174565026, "grad_norm": 2.064392328262329, "learning_rate": 5e-05, "loss": 2.4731, "step": 77000 }, { "epoch": 0.08076632748260676, "grad_norm": 2.113358497619629, "learning_rate": 5e-05, "loss": 2.4613, "step": 78000 }, { "epoch": 0.08180179321956327, "grad_norm": 1.6446911096572876, "learning_rate": 5e-05, "loss": 2.4614, "step": 79000 }, { "epoch": 0.08283725895651976, "grad_norm": 1.7507164478302002, "learning_rate": 5e-05, "loss": 2.4637, "step": 80000 }, { "epoch": 0.08283725895651976, "eval_loss": 2.445667266845703, "eval_runtime": 16.9458, "eval_samples_per_second": 2918.533, "eval_steps_per_second": 11.448, "step": 80000 }, { "epoch": 0.08387272469347626, "grad_norm": 1.8405325412750244, "learning_rate": 5e-05, "loss": 2.4431, "step": 81000 }, { "epoch": 0.08490819043043275, "grad_norm": 1.7342174053192139, "learning_rate": 5e-05, "loss": 2.4382, "step": 82000 }, { "epoch": 0.08594365616738925, "grad_norm": 1.7074140310287476, "learning_rate": 5e-05, "loss": 2.446, "step": 83000 }, { "epoch": 0.08697912190434574, "grad_norm": 1.7877367734909058, "learning_rate": 5e-05, "loss": 2.4437, "step": 84000 }, { "epoch": 0.08801458764130224, "grad_norm": 1.3164676427841187, "learning_rate": 5e-05, "loss": 2.4365, "step": 85000 }, { "epoch": 0.08801458764130224, "eval_loss": 2.418172597885132, "eval_runtime": 17.7099, "eval_samples_per_second": 2792.626, "eval_steps_per_second": 10.954, "step": 85000 }, { "epoch": 0.08905005337825873, "grad_norm": 1.9907282590866089, "learning_rate": 5e-05, "loss": 2.427, "step": 86000 }, { "epoch": 0.09008551911521524, "grad_norm": 1.7687242031097412, "learning_rate": 5e-05, "loss": 2.4223, "step": 87000 }, { "epoch": 0.09112098485217174, "grad_norm": 1.5485864877700806, "learning_rate": 5e-05, "loss": 2.4204, "step": 88000 }, { "epoch": 0.09215645058912823, "grad_norm": 1.7300666570663452, "learning_rate": 5e-05, "loss": 2.4314, "step": 89000 }, { "epoch": 0.09319191632608473, "grad_norm": 2.0416300296783447, "learning_rate": 5e-05, "loss": 2.4154, "step": 90000 }, { "epoch": 0.09319191632608473, "eval_loss": 2.4060208797454834, "eval_runtime": 18.224, "eval_samples_per_second": 2713.838, "eval_steps_per_second": 10.645, "step": 90000 }, { "epoch": 0.09422738206304122, "grad_norm": 1.6988297700881958, "learning_rate": 5e-05, "loss": 2.406, "step": 91000 }, { "epoch": 0.09526284779999772, "grad_norm": 1.6872934103012085, "learning_rate": 5e-05, "loss": 2.407, "step": 92000 }, { "epoch": 0.09629831353695421, "grad_norm": 1.7713836431503296, "learning_rate": 5e-05, "loss": 2.4032, "step": 93000 }, { "epoch": 0.09733377927391072, "grad_norm": 1.6413403749465942, "learning_rate": 5e-05, "loss": 2.3996, "step": 94000 }, { "epoch": 0.09836924501086722, "grad_norm": 1.9076873064041138, "learning_rate": 5e-05, "loss": 2.4004, "step": 95000 }, { "epoch": 0.09836924501086722, "eval_loss": 2.380493640899658, "eval_runtime": 17.1775, "eval_samples_per_second": 2879.167, "eval_steps_per_second": 11.294, "step": 95000 }, { "epoch": 0.09940471074782371, "grad_norm": 1.6223602294921875, "learning_rate": 5e-05, "loss": 2.3839, "step": 96000 }, { "epoch": 0.10044017648478021, "grad_norm": 1.9881786108016968, "learning_rate": 5e-05, "loss": 2.4024, "step": 97000 }, { "epoch": 0.1014756422217367, "grad_norm": 1.5491753816604614, "learning_rate": 5e-05, "loss": 2.3796, "step": 98000 }, { "epoch": 0.1025111079586932, "grad_norm": 1.735318899154663, "learning_rate": 5e-05, "loss": 2.3753, "step": 99000 }, { "epoch": 0.1035465736956497, "grad_norm": 1.5919784307479858, "learning_rate": 5e-05, "loss": 2.3768, "step": 100000 }, { "epoch": 0.1035465736956497, "eval_loss": 2.3675897121429443, "eval_runtime": 18.2938, "eval_samples_per_second": 2703.485, "eval_steps_per_second": 10.605, "step": 100000 }, { "epoch": 0.10458203943260619, "grad_norm": 1.3763296604156494, "learning_rate": 5e-05, "loss": 2.3749, "step": 101000 }, { "epoch": 0.1056175051695627, "grad_norm": 1.8743693828582764, "learning_rate": 5e-05, "loss": 2.3675, "step": 102000 }, { "epoch": 0.10665297090651919, "grad_norm": 1.7040822505950928, "learning_rate": 5e-05, "loss": 2.3778, "step": 103000 }, { "epoch": 0.10768843664347569, "grad_norm": 1.355368971824646, "learning_rate": 5e-05, "loss": 2.3658, "step": 104000 }, { "epoch": 0.10872390238043218, "grad_norm": 1.8316023349761963, "learning_rate": 5e-05, "loss": 2.3613, "step": 105000 }, { "epoch": 0.10872390238043218, "eval_loss": 2.3513875007629395, "eval_runtime": 20.1363, "eval_samples_per_second": 2456.116, "eval_steps_per_second": 9.634, "step": 105000 }, { "epoch": 0.10975936811738868, "grad_norm": 1.6225669384002686, "learning_rate": 5e-05, "loss": 2.3666, "step": 106000 }, { "epoch": 0.11079483385434517, "grad_norm": 1.6963558197021484, "learning_rate": 5e-05, "loss": 2.3507, "step": 107000 }, { "epoch": 0.11183029959130167, "grad_norm": 1.7828996181488037, "learning_rate": 5e-05, "loss": 2.3457, "step": 108000 }, { "epoch": 0.11286576532825818, "grad_norm": 1.6532371044158936, "learning_rate": 5e-05, "loss": 2.3637, "step": 109000 }, { "epoch": 0.11390123106521467, "grad_norm": 2.0967419147491455, "learning_rate": 5e-05, "loss": 2.3392, "step": 110000 }, { "epoch": 0.11390123106521467, "eval_loss": 2.337290048599243, "eval_runtime": 17.1687, "eval_samples_per_second": 2880.654, "eval_steps_per_second": 11.3, "step": 110000 }, { "epoch": 0.11493669680217117, "grad_norm": 2.1648826599121094, "learning_rate": 5e-05, "loss": 2.3377, "step": 111000 }, { "epoch": 0.11597216253912766, "grad_norm": 1.6574516296386719, "learning_rate": 5e-05, "loss": 2.3364, "step": 112000 }, { "epoch": 0.11700762827608416, "grad_norm": 1.8057987689971924, "learning_rate": 5e-05, "loss": 2.3341, "step": 113000 }, { "epoch": 0.11804309401304065, "grad_norm": 1.8663420677185059, "learning_rate": 5e-05, "loss": 2.3392, "step": 114000 }, { "epoch": 0.11907855974999715, "grad_norm": 1.688050389289856, "learning_rate": 5e-05, "loss": 2.3386, "step": 115000 }, { "epoch": 0.11907855974999715, "eval_loss": 2.3217532634735107, "eval_runtime": 17.6363, "eval_samples_per_second": 2804.28, "eval_steps_per_second": 11.0, "step": 115000 }, { "epoch": 0.12011402548695364, "grad_norm": 2.507835626602173, "learning_rate": 5e-05, "loss": 2.3248, "step": 116000 }, { "epoch": 0.12114949122391015, "grad_norm": 1.6291868686676025, "learning_rate": 5e-05, "loss": 2.325, "step": 117000 }, { "epoch": 0.12218495696086665, "grad_norm": 1.5350573062896729, "learning_rate": 5e-05, "loss": 2.3386, "step": 118000 }, { "epoch": 0.12322042269782314, "grad_norm": 1.751421332359314, "learning_rate": 5e-05, "loss": 2.3197, "step": 119000 }, { "epoch": 0.12425588843477964, "grad_norm": 1.9885656833648682, "learning_rate": 5e-05, "loss": 2.3221, "step": 120000 }, { "epoch": 0.12425588843477964, "eval_loss": 2.2982702255249023, "eval_runtime": 18.9907, "eval_samples_per_second": 2604.28, "eval_steps_per_second": 10.216, "step": 120000 }, { "epoch": 0.12529135417173615, "grad_norm": 1.4570821523666382, "learning_rate": 5e-05, "loss": 2.3159, "step": 121000 }, { "epoch": 0.12632681990869263, "grad_norm": 1.4419127702713013, "learning_rate": 5e-05, "loss": 2.3184, "step": 122000 }, { "epoch": 0.12736228564564914, "grad_norm": 1.6944835186004639, "learning_rate": 5e-05, "loss": 2.3131, "step": 123000 }, { "epoch": 0.12839775138260562, "grad_norm": 1.7291606664657593, "learning_rate": 5e-05, "loss": 2.3031, "step": 124000 }, { "epoch": 0.12943321711956213, "grad_norm": 1.6769369840621948, "learning_rate": 5e-05, "loss": 2.2995, "step": 125000 }, { "epoch": 0.12943321711956213, "eval_loss": 2.295583724975586, "eval_runtime": 31.5701, "eval_samples_per_second": 1566.579, "eval_steps_per_second": 6.145, "step": 125000 }, { "epoch": 0.1304686828565186, "grad_norm": 1.7094770669937134, "learning_rate": 5e-05, "loss": 2.303, "step": 126000 }, { "epoch": 0.13150414859347512, "grad_norm": 1.5756804943084717, "learning_rate": 5e-05, "loss": 2.3048, "step": 127000 }, { "epoch": 0.13253961433043163, "grad_norm": 1.6020985841751099, "learning_rate": 5e-05, "loss": 2.3037, "step": 128000 }, { "epoch": 0.1335750800673881, "grad_norm": 1.2563997507095337, "learning_rate": 5e-05, "loss": 2.2906, "step": 129000 }, { "epoch": 0.13461054580434462, "grad_norm": 1.8040730953216553, "learning_rate": 5e-05, "loss": 2.2813, "step": 130000 }, { "epoch": 0.13461054580434462, "eval_loss": 2.2846579551696777, "eval_runtime": 19.9964, "eval_samples_per_second": 2473.3, "eval_steps_per_second": 9.702, "step": 130000 }, { "epoch": 0.1356460115413011, "grad_norm": 1.693924903869629, "learning_rate": 5e-05, "loss": 2.2897, "step": 131000 }, { "epoch": 0.1366814772782576, "grad_norm": 1.8027820587158203, "learning_rate": 5e-05, "loss": 2.2875, "step": 132000 }, { "epoch": 0.1377169430152141, "grad_norm": 1.7695696353912354, "learning_rate": 5e-05, "loss": 2.2833, "step": 133000 }, { "epoch": 0.1387524087521706, "grad_norm": 1.5467225313186646, "learning_rate": 5e-05, "loss": 2.2945, "step": 134000 }, { "epoch": 0.13978787448912708, "grad_norm": 1.5252859592437744, "learning_rate": 5e-05, "loss": 2.2698, "step": 135000 }, { "epoch": 0.13978787448912708, "eval_loss": 2.271275520324707, "eval_runtime": 18.9188, "eval_samples_per_second": 2614.178, "eval_steps_per_second": 10.254, "step": 135000 }, { "epoch": 0.1408233402260836, "grad_norm": 1.7169549465179443, "learning_rate": 5e-05, "loss": 2.2983, "step": 136000 }, { "epoch": 0.1418588059630401, "grad_norm": 1.4472647905349731, "learning_rate": 5e-05, "loss": 2.2863, "step": 137000 }, { "epoch": 0.14289427169999658, "grad_norm": 1.9395118951797485, "learning_rate": 5e-05, "loss": 2.2764, "step": 138000 }, { "epoch": 0.1439297374369531, "grad_norm": 1.5419905185699463, "learning_rate": 5e-05, "loss": 2.2753, "step": 139000 }, { "epoch": 0.14496520317390957, "grad_norm": 1.6046137809753418, "learning_rate": 5e-05, "loss": 2.2895, "step": 140000 }, { "epoch": 0.14496520317390957, "eval_loss": 2.2650840282440186, "eval_runtime": 17.554, "eval_samples_per_second": 2817.421, "eval_steps_per_second": 11.052, "step": 140000 }, { "epoch": 0.14600066891086608, "grad_norm": 1.755807638168335, "learning_rate": 5e-05, "loss": 2.2728, "step": 141000 }, { "epoch": 0.14703613464782256, "grad_norm": 2.676787853240967, "learning_rate": 5e-05, "loss": 2.2562, "step": 142000 }, { "epoch": 0.14807160038477907, "grad_norm": 1.503091812133789, "learning_rate": 5e-05, "loss": 2.256, "step": 143000 }, { "epoch": 0.14910706612173558, "grad_norm": 1.8047343492507935, "learning_rate": 5e-05, "loss": 2.2588, "step": 144000 }, { "epoch": 0.15014253185869206, "grad_norm": 1.6858866214752197, "learning_rate": 5e-05, "loss": 2.2605, "step": 145000 }, { "epoch": 0.15014253185869206, "eval_loss": 2.239438056945801, "eval_runtime": 30.413, "eval_samples_per_second": 1626.178, "eval_steps_per_second": 6.379, "step": 145000 }, { "epoch": 0.15117799759564857, "grad_norm": 1.4932698011398315, "learning_rate": 5e-05, "loss": 2.261, "step": 146000 }, { "epoch": 0.15221346333260505, "grad_norm": 1.627295732498169, "learning_rate": 5e-05, "loss": 2.2472, "step": 147000 }, { "epoch": 0.15324892906956156, "grad_norm": 1.7859046459197998, "learning_rate": 5e-05, "loss": 2.2594, "step": 148000 }, { "epoch": 0.15428439480651804, "grad_norm": 1.6636815071105957, "learning_rate": 5e-05, "loss": 2.2594, "step": 149000 }, { "epoch": 0.15531986054347455, "grad_norm": 2.078432083129883, "learning_rate": 5e-05, "loss": 2.256, "step": 150000 }, { "epoch": 0.15531986054347455, "eval_loss": 2.2415122985839844, "eval_runtime": 18.5954, "eval_samples_per_second": 2659.638, "eval_steps_per_second": 10.433, "step": 150000 }, { "epoch": 0.15635532628043106, "grad_norm": 1.2664598226547241, "learning_rate": 5e-05, "loss": 2.2497, "step": 151000 }, { "epoch": 0.15739079201738754, "grad_norm": 1.5538097620010376, "learning_rate": 5e-05, "loss": 2.2499, "step": 152000 }, { "epoch": 0.15842625775434405, "grad_norm": 1.8865635395050049, "learning_rate": 5e-05, "loss": 2.2489, "step": 153000 }, { "epoch": 0.15946172349130053, "grad_norm": 1.5606614351272583, "learning_rate": 5e-05, "loss": 2.2388, "step": 154000 }, { "epoch": 0.16049718922825704, "grad_norm": 1.9067413806915283, "learning_rate": 5e-05, "loss": 2.24, "step": 155000 }, { "epoch": 0.16049718922825704, "eval_loss": 2.2304790019989014, "eval_runtime": 17.7179, "eval_samples_per_second": 2791.36, "eval_steps_per_second": 10.949, "step": 155000 }, { "epoch": 0.16153265496521352, "grad_norm": 1.8727903366088867, "learning_rate": 5e-05, "loss": 2.2338, "step": 156000 }, { "epoch": 0.16256812070217003, "grad_norm": 1.8794825077056885, "learning_rate": 5e-05, "loss": 2.2472, "step": 157000 }, { "epoch": 0.16360358643912654, "grad_norm": 1.9586312770843506, "learning_rate": 5e-05, "loss": 2.2411, "step": 158000 }, { "epoch": 0.16463905217608302, "grad_norm": 2.5029993057250977, "learning_rate": 5e-05, "loss": 2.245, "step": 159000 }, { "epoch": 0.16567451791303953, "grad_norm": 1.5990197658538818, "learning_rate": 5e-05, "loss": 2.2239, "step": 160000 }, { "epoch": 0.16567451791303953, "eval_loss": 2.219214916229248, "eval_runtime": 19.7117, "eval_samples_per_second": 2509.014, "eval_steps_per_second": 9.842, "step": 160000 }, { "epoch": 0.166709983649996, "grad_norm": 2.100264072418213, "learning_rate": 5e-05, "loss": 2.2337, "step": 161000 }, { "epoch": 0.16774544938695252, "grad_norm": 1.7322697639465332, "learning_rate": 5e-05, "loss": 2.2222, "step": 162000 }, { "epoch": 0.168780915123909, "grad_norm": 1.6511707305908203, "learning_rate": 5e-05, "loss": 2.2287, "step": 163000 }, { "epoch": 0.1698163808608655, "grad_norm": 1.5296807289123535, "learning_rate": 5e-05, "loss": 2.2269, "step": 164000 }, { "epoch": 0.170851846597822, "grad_norm": 1.788960337638855, "learning_rate": 5e-05, "loss": 2.2237, "step": 165000 }, { "epoch": 0.170851846597822, "eval_loss": 2.2153329849243164, "eval_runtime": 18.4747, "eval_samples_per_second": 2677.015, "eval_steps_per_second": 10.501, "step": 165000 }, { "epoch": 0.1718873123347785, "grad_norm": 1.592215657234192, "learning_rate": 5e-05, "loss": 2.2246, "step": 166000 }, { "epoch": 0.172922778071735, "grad_norm": 1.7051317691802979, "learning_rate": 5e-05, "loss": 2.2075, "step": 167000 }, { "epoch": 0.1739582438086915, "grad_norm": 1.8867826461791992, "learning_rate": 5e-05, "loss": 2.2187, "step": 168000 }, { "epoch": 0.174993709545648, "grad_norm": 1.5261858701705933, "learning_rate": 5e-05, "loss": 2.2068, "step": 169000 }, { "epoch": 0.17602917528260448, "grad_norm": 1.7023552656173706, "learning_rate": 5e-05, "loss": 2.231, "step": 170000 }, { "epoch": 0.17602917528260448, "eval_loss": 2.200360059738159, "eval_runtime": 34.4793, "eval_samples_per_second": 1434.396, "eval_steps_per_second": 5.627, "step": 170000 }, { "epoch": 0.177064641019561, "grad_norm": 1.6329045295715332, "learning_rate": 5e-05, "loss": 2.217, "step": 171000 }, { "epoch": 0.17810010675651747, "grad_norm": 1.8211392164230347, "learning_rate": 5e-05, "loss": 2.2109, "step": 172000 }, { "epoch": 0.17913557249347398, "grad_norm": 1.8491740226745605, "learning_rate": 5e-05, "loss": 2.21, "step": 173000 }, { "epoch": 0.18017103823043049, "grad_norm": 1.5435434579849243, "learning_rate": 5e-05, "loss": 2.2026, "step": 174000 }, { "epoch": 0.18120650396738697, "grad_norm": 1.8491019010543823, "learning_rate": 5e-05, "loss": 2.2101, "step": 175000 }, { "epoch": 0.18120650396738697, "eval_loss": 2.1987149715423584, "eval_runtime": 18.5985, "eval_samples_per_second": 2659.196, "eval_steps_per_second": 10.431, "step": 175000 }, { "epoch": 0.18224196970434348, "grad_norm": 1.6454964876174927, "learning_rate": 5e-05, "loss": 2.2014, "step": 176000 }, { "epoch": 0.18327743544129996, "grad_norm": 1.5947151184082031, "learning_rate": 5e-05, "loss": 2.2077, "step": 177000 }, { "epoch": 0.18431290117825647, "grad_norm": 1.47206711769104, "learning_rate": 5e-05, "loss": 2.2087, "step": 178000 }, { "epoch": 0.18534836691521295, "grad_norm": 1.5847997665405273, "learning_rate": 5e-05, "loss": 2.2198, "step": 179000 }, { "epoch": 0.18638383265216946, "grad_norm": 1.543878436088562, "learning_rate": 5e-05, "loss": 2.198, "step": 180000 }, { "epoch": 0.18638383265216946, "eval_loss": 2.1845173835754395, "eval_runtime": 19.3852, "eval_samples_per_second": 2551.271, "eval_steps_per_second": 10.008, "step": 180000 }, { "epoch": 0.18741929838912597, "grad_norm": 2.046430826187134, "learning_rate": 5e-05, "loss": 2.1924, "step": 181000 }, { "epoch": 0.18845476412608245, "grad_norm": 1.3812047243118286, "learning_rate": 5e-05, "loss": 2.1962, "step": 182000 }, { "epoch": 0.18949022986303896, "grad_norm": 2.3460304737091064, "learning_rate": 5e-05, "loss": 2.2018, "step": 183000 }, { "epoch": 0.19052569559999544, "grad_norm": 1.5330251455307007, "learning_rate": 5e-05, "loss": 2.1828, "step": 184000 }, { "epoch": 0.19156116133695195, "grad_norm": 1.693895697593689, "learning_rate": 5e-05, "loss": 2.1763, "step": 185000 }, { "epoch": 0.19156116133695195, "eval_loss": 2.172133684158325, "eval_runtime": 19.1958, "eval_samples_per_second": 2576.448, "eval_steps_per_second": 10.106, "step": 185000 }, { "epoch": 0.19259662707390843, "grad_norm": 1.7285739183425903, "learning_rate": 5e-05, "loss": 2.197, "step": 186000 }, { "epoch": 0.19363209281086494, "grad_norm": 1.8407094478607178, "learning_rate": 5e-05, "loss": 2.1856, "step": 187000 }, { "epoch": 0.19466755854782145, "grad_norm": 1.7621251344680786, "learning_rate": 5e-05, "loss": 2.1986, "step": 188000 }, { "epoch": 0.19570302428477793, "grad_norm": 1.4718973636627197, "learning_rate": 5e-05, "loss": 2.1729, "step": 189000 }, { "epoch": 0.19673849002173444, "grad_norm": 2.0074100494384766, "learning_rate": 5e-05, "loss": 2.1893, "step": 190000 }, { "epoch": 0.19673849002173444, "eval_loss": 2.172874927520752, "eval_runtime": 18.8943, "eval_samples_per_second": 2617.559, "eval_steps_per_second": 10.268, "step": 190000 }, { "epoch": 0.19777395575869092, "grad_norm": 1.919875144958496, "learning_rate": 5e-05, "loss": 2.1847, "step": 191000 }, { "epoch": 0.19880942149564743, "grad_norm": 1.9104081392288208, "learning_rate": 5e-05, "loss": 2.1772, "step": 192000 }, { "epoch": 0.1998448872326039, "grad_norm": 1.5770833492279053, "learning_rate": 5e-05, "loss": 2.1737, "step": 193000 }, { "epoch": 0.20088035296956042, "grad_norm": 1.6979726552963257, "learning_rate": 5e-05, "loss": 2.1754, "step": 194000 }, { "epoch": 0.2019158187065169, "grad_norm": 1.5536943674087524, "learning_rate": 5e-05, "loss": 2.1688, "step": 195000 }, { "epoch": 0.2019158187065169, "eval_loss": 2.163553476333618, "eval_runtime": 21.8229, "eval_samples_per_second": 2266.291, "eval_steps_per_second": 8.89, "step": 195000 }, { "epoch": 0.2029512844434734, "grad_norm": 1.6107367277145386, "learning_rate": 5e-05, "loss": 2.1824, "step": 196000 }, { "epoch": 0.20398675018042992, "grad_norm": 1.4322165250778198, "learning_rate": 5e-05, "loss": 2.1698, "step": 197000 }, { "epoch": 0.2050222159173864, "grad_norm": 1.6819088459014893, "learning_rate": 5e-05, "loss": 2.1653, "step": 198000 }, { "epoch": 0.2060576816543429, "grad_norm": 1.6152656078338623, "learning_rate": 5e-05, "loss": 2.1617, "step": 199000 }, { "epoch": 0.2070931473912994, "grad_norm": 1.5925724506378174, "learning_rate": 5e-05, "loss": 2.1628, "step": 200000 }, { "epoch": 0.2070931473912994, "eval_loss": 2.1573803424835205, "eval_runtime": 19.1084, "eval_samples_per_second": 2588.231, "eval_steps_per_second": 10.153, "step": 200000 }, { "epoch": 0.2081286131282559, "grad_norm": 2.013730049133301, "learning_rate": 5e-05, "loss": 2.1803, "step": 201000 }, { "epoch": 0.20916407886521238, "grad_norm": 1.5648040771484375, "learning_rate": 5e-05, "loss": 2.1807, "step": 202000 }, { "epoch": 0.2101995446021689, "grad_norm": 1.5546032190322876, "learning_rate": 5e-05, "loss": 2.1536, "step": 203000 }, { "epoch": 0.2112350103391254, "grad_norm": 1.5997415781021118, "learning_rate": 5e-05, "loss": 2.1622, "step": 204000 }, { "epoch": 0.21227047607608188, "grad_norm": 1.656101942062378, "learning_rate": 5e-05, "loss": 2.1599, "step": 205000 }, { "epoch": 0.21227047607608188, "eval_loss": 2.156127452850342, "eval_runtime": 19.0436, "eval_samples_per_second": 2597.044, "eval_steps_per_second": 10.187, "step": 205000 }, { "epoch": 0.21330594181303839, "grad_norm": 1.6793824434280396, "learning_rate": 5e-05, "loss": 2.1545, "step": 206000 }, { "epoch": 0.21434140754999487, "grad_norm": 2.1217029094696045, "learning_rate": 5e-05, "loss": 2.1508, "step": 207000 }, { "epoch": 0.21537687328695138, "grad_norm": 1.2719789743423462, "learning_rate": 5e-05, "loss": 2.1573, "step": 208000 }, { "epoch": 0.21641233902390786, "grad_norm": 1.5386024713516235, "learning_rate": 5e-05, "loss": 2.1478, "step": 209000 }, { "epoch": 0.21744780476086437, "grad_norm": 2.2390332221984863, "learning_rate": 5e-05, "loss": 2.1593, "step": 210000 }, { "epoch": 0.21744780476086437, "eval_loss": 2.1487836837768555, "eval_runtime": 18.2274, "eval_samples_per_second": 2713.328, "eval_steps_per_second": 10.643, "step": 210000 }, { "epoch": 0.21848327049782088, "grad_norm": 1.6317909955978394, "learning_rate": 5e-05, "loss": 2.1578, "step": 211000 }, { "epoch": 0.21951873623477736, "grad_norm": 1.6397149562835693, "learning_rate": 5e-05, "loss": 2.1567, "step": 212000 }, { "epoch": 0.22055420197173387, "grad_norm": 1.6141470670700073, "learning_rate": 5e-05, "loss": 2.1493, "step": 213000 }, { "epoch": 0.22158966770869035, "grad_norm": 1.6077841520309448, "learning_rate": 5e-05, "loss": 2.1383, "step": 214000 }, { "epoch": 0.22262513344564686, "grad_norm": 1.4669524431228638, "learning_rate": 5e-05, "loss": 2.1449, "step": 215000 }, { "epoch": 0.22262513344564686, "eval_loss": 2.1384260654449463, "eval_runtime": 18.3245, "eval_samples_per_second": 2698.962, "eval_steps_per_second": 10.587, "step": 215000 }, { "epoch": 0.22366059918260334, "grad_norm": 1.6204700469970703, "learning_rate": 5e-05, "loss": 2.1601, "step": 216000 }, { "epoch": 0.22469606491955985, "grad_norm": 1.33708655834198, "learning_rate": 5e-05, "loss": 2.1419, "step": 217000 }, { "epoch": 0.22573153065651635, "grad_norm": 1.765945315361023, "learning_rate": 5e-05, "loss": 2.1406, "step": 218000 }, { "epoch": 0.22676699639347284, "grad_norm": 1.6463714838027954, "learning_rate": 5e-05, "loss": 2.1533, "step": 219000 }, { "epoch": 0.22780246213042935, "grad_norm": 1.5012264251708984, "learning_rate": 5e-05, "loss": 2.1446, "step": 220000 }, { "epoch": 0.22780246213042935, "eval_loss": 2.1217093467712402, "eval_runtime": 27.5974, "eval_samples_per_second": 1792.091, "eval_steps_per_second": 7.03, "step": 220000 }, { "epoch": 0.22883792786738583, "grad_norm": 1.6473504304885864, "learning_rate": 5e-05, "loss": 2.1512, "step": 221000 }, { "epoch": 0.22987339360434234, "grad_norm": 1.60206139087677, "learning_rate": 5e-05, "loss": 2.1426, "step": 222000 }, { "epoch": 0.23090885934129882, "grad_norm": 1.8370537757873535, "learning_rate": 5e-05, "loss": 2.1438, "step": 223000 }, { "epoch": 0.23194432507825533, "grad_norm": 1.3373557329177856, "learning_rate": 5e-05, "loss": 2.1315, "step": 224000 }, { "epoch": 0.2329797908152118, "grad_norm": 1.6836400032043457, "learning_rate": 5e-05, "loss": 2.1354, "step": 225000 }, { "epoch": 0.2329797908152118, "eval_loss": 2.125302791595459, "eval_runtime": 22.4712, "eval_samples_per_second": 2200.902, "eval_steps_per_second": 8.633, "step": 225000 }, { "epoch": 0.23401525655216832, "grad_norm": 1.7297096252441406, "learning_rate": 5e-05, "loss": 2.1325, "step": 226000 }, { "epoch": 0.23505072228912482, "grad_norm": 1.4455069303512573, "learning_rate": 5e-05, "loss": 2.1299, "step": 227000 }, { "epoch": 0.2360861880260813, "grad_norm": 1.429310917854309, "learning_rate": 5e-05, "loss": 2.1457, "step": 228000 }, { "epoch": 0.23712165376303782, "grad_norm": 1.7447959184646606, "learning_rate": 5e-05, "loss": 2.1349, "step": 229000 }, { "epoch": 0.2381571194999943, "grad_norm": 1.6983305215835571, "learning_rate": 5e-05, "loss": 2.1201, "step": 230000 }, { "epoch": 0.2381571194999943, "eval_loss": 2.1202392578125, "eval_runtime": 19.1303, "eval_samples_per_second": 2585.264, "eval_steps_per_second": 10.141, "step": 230000 }, { "epoch": 0.2391925852369508, "grad_norm": 1.5676405429840088, "learning_rate": 5e-05, "loss": 2.1336, "step": 231000 }, { "epoch": 0.2402280509739073, "grad_norm": 1.6670252084732056, "learning_rate": 5e-05, "loss": 2.1187, "step": 232000 }, { "epoch": 0.2412635167108638, "grad_norm": 1.5296714305877686, "learning_rate": 5e-05, "loss": 2.125, "step": 233000 }, { "epoch": 0.2422989824478203, "grad_norm": 1.5627477169036865, "learning_rate": 5e-05, "loss": 2.121, "step": 234000 }, { "epoch": 0.24333444818477679, "grad_norm": 2.2698166370391846, "learning_rate": 5e-05, "loss": 2.1426, "step": 235000 }, { "epoch": 0.24333444818477679, "eval_loss": 2.1195831298828125, "eval_runtime": 18.9698, "eval_samples_per_second": 2607.149, "eval_steps_per_second": 10.227, "step": 235000 }, { "epoch": 0.2443699139217333, "grad_norm": 1.9312177896499634, "learning_rate": 5e-05, "loss": 2.1289, "step": 236000 }, { "epoch": 0.24540537965868978, "grad_norm": 1.6321558952331543, "learning_rate": 5e-05, "loss": 2.1236, "step": 237000 }, { "epoch": 0.24644084539564629, "grad_norm": 1.7519464492797852, "learning_rate": 5e-05, "loss": 2.1175, "step": 238000 }, { "epoch": 0.24747631113260277, "grad_norm": 1.5893385410308838, "learning_rate": 5e-05, "loss": 2.1175, "step": 239000 }, { "epoch": 0.24851177686955928, "grad_norm": 1.7425572872161865, "learning_rate": 5e-05, "loss": 2.1125, "step": 240000 }, { "epoch": 0.24851177686955928, "eval_loss": 2.1114916801452637, "eval_runtime": 18.5753, "eval_samples_per_second": 2662.508, "eval_steps_per_second": 10.444, "step": 240000 }, { "epoch": 0.24954724260651578, "grad_norm": 1.7862157821655273, "learning_rate": 5e-05, "loss": 2.1228, "step": 241000 }, { "epoch": 0.2505827083434723, "grad_norm": 1.2887235879898071, "learning_rate": 5e-05, "loss": 2.1298, "step": 242000 }, { "epoch": 0.25161817408042875, "grad_norm": 1.5348347425460815, "learning_rate": 5e-05, "loss": 2.1247, "step": 243000 }, { "epoch": 0.25265363981738526, "grad_norm": 1.669761061668396, "learning_rate": 5e-05, "loss": 2.1172, "step": 244000 }, { "epoch": 0.25368910555434177, "grad_norm": 1.881727933883667, "learning_rate": 5e-05, "loss": 2.1118, "step": 245000 }, { "epoch": 0.25368910555434177, "eval_loss": 2.103923797607422, "eval_runtime": 19.0175, "eval_samples_per_second": 2600.599, "eval_steps_per_second": 10.201, "step": 245000 }, { "epoch": 0.2547245712912983, "grad_norm": 1.744746446609497, "learning_rate": 5e-05, "loss": 2.1077, "step": 246000 }, { "epoch": 0.2557600370282547, "grad_norm": 1.7865989208221436, "learning_rate": 5e-05, "loss": 2.1147, "step": 247000 }, { "epoch": 0.25679550276521124, "grad_norm": 1.640703558921814, "learning_rate": 5e-05, "loss": 2.1066, "step": 248000 }, { "epoch": 0.25783096850216775, "grad_norm": 1.8829026222229004, "learning_rate": 5e-05, "loss": 2.1118, "step": 249000 }, { "epoch": 0.25886643423912425, "grad_norm": 1.4332600831985474, "learning_rate": 5e-05, "loss": 2.1051, "step": 250000 }, { "epoch": 0.25886643423912425, "eval_loss": 2.097465753555298, "eval_runtime": 18.8905, "eval_samples_per_second": 2618.088, "eval_steps_per_second": 10.27, "step": 250000 }, { "epoch": 0.25990189997608076, "grad_norm": 1.1934382915496826, "learning_rate": 5e-05, "loss": 2.1017, "step": 251000 }, { "epoch": 0.2609373657130372, "grad_norm": 1.7838383913040161, "learning_rate": 5e-05, "loss": 2.1196, "step": 252000 }, { "epoch": 0.2619728314499937, "grad_norm": 1.6719322204589844, "learning_rate": 5e-05, "loss": 2.1117, "step": 253000 }, { "epoch": 0.26300829718695024, "grad_norm": 1.5883870124816895, "learning_rate": 5e-05, "loss": 2.1019, "step": 254000 }, { "epoch": 0.26404376292390674, "grad_norm": 1.5117872953414917, "learning_rate": 5e-05, "loss": 2.1132, "step": 255000 }, { "epoch": 0.26404376292390674, "eval_loss": 2.0989463329315186, "eval_runtime": 21.0632, "eval_samples_per_second": 2348.034, "eval_steps_per_second": 9.21, "step": 255000 }, { "epoch": 0.26507922866086325, "grad_norm": 1.7486399412155151, "learning_rate": 5e-05, "loss": 2.1182, "step": 256000 }, { "epoch": 0.2661146943978197, "grad_norm": 1.401721477508545, "learning_rate": 5e-05, "loss": 2.1081, "step": 257000 }, { "epoch": 0.2671501601347762, "grad_norm": 1.6972362995147705, "learning_rate": 5e-05, "loss": 2.0914, "step": 258000 }, { "epoch": 0.2681856258717327, "grad_norm": 1.5844149589538574, "learning_rate": 5e-05, "loss": 2.0885, "step": 259000 }, { "epoch": 0.26922109160868923, "grad_norm": 1.492384910583496, "learning_rate": 5e-05, "loss": 2.0961, "step": 260000 }, { "epoch": 0.26922109160868923, "eval_loss": 2.090730905532837, "eval_runtime": 20.2892, "eval_samples_per_second": 2437.604, "eval_steps_per_second": 9.562, "step": 260000 }, { "epoch": 0.2702565573456457, "grad_norm": 1.7390309572219849, "learning_rate": 5e-05, "loss": 2.1005, "step": 261000 }, { "epoch": 0.2712920230826022, "grad_norm": 1.9608066082000732, "learning_rate": 5e-05, "loss": 2.0958, "step": 262000 }, { "epoch": 0.2723274888195587, "grad_norm": 1.6389927864074707, "learning_rate": 5e-05, "loss": 2.1028, "step": 263000 }, { "epoch": 0.2733629545565152, "grad_norm": 1.43370521068573, "learning_rate": 5e-05, "loss": 2.0861, "step": 264000 }, { "epoch": 0.2743984202934717, "grad_norm": 1.4951225519180298, "learning_rate": 5e-05, "loss": 2.0864, "step": 265000 }, { "epoch": 0.2743984202934717, "eval_loss": 2.0853793621063232, "eval_runtime": 18.9558, "eval_samples_per_second": 2609.067, "eval_steps_per_second": 10.234, "step": 265000 }, { "epoch": 0.2754338860304282, "grad_norm": 1.507961392402649, "learning_rate": 5e-05, "loss": 2.0915, "step": 266000 }, { "epoch": 0.2764693517673847, "grad_norm": 1.930547833442688, "learning_rate": 5e-05, "loss": 2.0845, "step": 267000 }, { "epoch": 0.2775048175043412, "grad_norm": 1.947425127029419, "learning_rate": 5e-05, "loss": 2.0861, "step": 268000 }, { "epoch": 0.2785402832412977, "grad_norm": 1.6057015657424927, "learning_rate": 5e-05, "loss": 2.1003, "step": 269000 }, { "epoch": 0.27957574897825416, "grad_norm": 1.6992744207382202, "learning_rate": 5e-05, "loss": 2.0844, "step": 270000 }, { "epoch": 0.27957574897825416, "eval_loss": 2.0863852500915527, "eval_runtime": 19.2536, "eval_samples_per_second": 2568.715, "eval_steps_per_second": 10.076, "step": 270000 }, { "epoch": 0.28061121471521067, "grad_norm": 1.6745386123657227, "learning_rate": 5e-05, "loss": 2.0863, "step": 271000 }, { "epoch": 0.2816466804521672, "grad_norm": 1.480391502380371, "learning_rate": 5e-05, "loss": 2.0832, "step": 272000 }, { "epoch": 0.2826821461891237, "grad_norm": 1.7411426305770874, "learning_rate": 5e-05, "loss": 2.0885, "step": 273000 }, { "epoch": 0.2837176119260802, "grad_norm": 1.5064642429351807, "learning_rate": 5e-05, "loss": 2.096, "step": 274000 }, { "epoch": 0.28475307766303665, "grad_norm": 1.3575658798217773, "learning_rate": 5e-05, "loss": 2.0908, "step": 275000 }, { "epoch": 0.28475307766303665, "eval_loss": 2.075481414794922, "eval_runtime": 21.5208, "eval_samples_per_second": 2298.108, "eval_steps_per_second": 9.015, "step": 275000 }, { "epoch": 0.28578854339999316, "grad_norm": 1.7280150651931763, "learning_rate": 5e-05, "loss": 2.0925, "step": 276000 }, { "epoch": 0.28682400913694966, "grad_norm": 1.7355035543441772, "learning_rate": 5e-05, "loss": 2.0832, "step": 277000 }, { "epoch": 0.2878594748739062, "grad_norm": 1.594399094581604, "learning_rate": 5e-05, "loss": 2.0874, "step": 278000 }, { "epoch": 0.2888949406108627, "grad_norm": 1.5549061298370361, "learning_rate": 5e-05, "loss": 2.0836, "step": 279000 }, { "epoch": 0.28993040634781914, "grad_norm": 1.8054208755493164, "learning_rate": 5e-05, "loss": 2.0783, "step": 280000 }, { "epoch": 0.28993040634781914, "eval_loss": 2.067439317703247, "eval_runtime": 19.2353, "eval_samples_per_second": 2571.156, "eval_steps_per_second": 10.086, "step": 280000 }, { "epoch": 0.29096587208477565, "grad_norm": 1.4210501909255981, "learning_rate": 5e-05, "loss": 2.0854, "step": 281000 }, { "epoch": 0.29200133782173215, "grad_norm": 1.9252066612243652, "learning_rate": 5e-05, "loss": 2.083, "step": 282000 }, { "epoch": 0.29303680355868866, "grad_norm": 1.6770497560501099, "learning_rate": 5e-05, "loss": 2.087, "step": 283000 }, { "epoch": 0.2940722692956451, "grad_norm": 1.5951564311981201, "learning_rate": 5e-05, "loss": 2.0812, "step": 284000 }, { "epoch": 0.2951077350326016, "grad_norm": 1.6986280679702759, "learning_rate": 5e-05, "loss": 2.0786, "step": 285000 }, { "epoch": 0.2951077350326016, "eval_loss": 2.0688838958740234, "eval_runtime": 18.9767, "eval_samples_per_second": 2606.195, "eval_steps_per_second": 10.223, "step": 285000 }, { "epoch": 0.29614320076955813, "grad_norm": 1.6687910556793213, "learning_rate": 5e-05, "loss": 2.088, "step": 286000 }, { "epoch": 0.29717866650651464, "grad_norm": 1.308746337890625, "learning_rate": 5e-05, "loss": 2.0736, "step": 287000 }, { "epoch": 0.29821413224347115, "grad_norm": 1.5213390588760376, "learning_rate": 5e-05, "loss": 2.0774, "step": 288000 }, { "epoch": 0.2992495979804276, "grad_norm": 1.8728256225585938, "learning_rate": 5e-05, "loss": 2.0737, "step": 289000 }, { "epoch": 0.3002850637173841, "grad_norm": 1.5906774997711182, "learning_rate": 5e-05, "loss": 2.0739, "step": 290000 }, { "epoch": 0.3002850637173841, "eval_loss": 2.061387538909912, "eval_runtime": 34.9804, "eval_samples_per_second": 1413.847, "eval_steps_per_second": 5.546, "step": 290000 }, { "epoch": 0.3013205294543406, "grad_norm": 1.412349820137024, "learning_rate": 5e-05, "loss": 2.0672, "step": 291000 }, { "epoch": 0.30235599519129713, "grad_norm": 1.9999415874481201, "learning_rate": 5e-05, "loss": 2.0611, "step": 292000 }, { "epoch": 0.3033914609282536, "grad_norm": 1.4650294780731201, "learning_rate": 5e-05, "loss": 2.0698, "step": 293000 }, { "epoch": 0.3044269266652101, "grad_norm": 2.1507210731506348, "learning_rate": 5e-05, "loss": 2.0636, "step": 294000 }, { "epoch": 0.3054623924021666, "grad_norm": 1.7981712818145752, "learning_rate": 5e-05, "loss": 2.0701, "step": 295000 }, { "epoch": 0.3054623924021666, "eval_loss": 2.0662500858306885, "eval_runtime": 20.4231, "eval_samples_per_second": 2421.625, "eval_steps_per_second": 9.499, "step": 295000 }, { "epoch": 0.3064978581391231, "grad_norm": 1.7765494585037231, "learning_rate": 5e-05, "loss": 2.0748, "step": 296000 }, { "epoch": 0.3075333238760796, "grad_norm": 1.7243373394012451, "learning_rate": 5e-05, "loss": 2.0734, "step": 297000 }, { "epoch": 0.3085687896130361, "grad_norm": 1.7474713325500488, "learning_rate": 5e-05, "loss": 2.0665, "step": 298000 }, { "epoch": 0.3096042553499926, "grad_norm": 1.820787787437439, "learning_rate": 5e-05, "loss": 2.069, "step": 299000 }, { "epoch": 0.3106397210869491, "grad_norm": 1.5175533294677734, "learning_rate": 5e-05, "loss": 2.0667, "step": 300000 }, { "epoch": 0.3106397210869491, "eval_loss": 2.058307647705078, "eval_runtime": 20.3497, "eval_samples_per_second": 2430.357, "eval_steps_per_second": 9.533, "step": 300000 }, { "epoch": 0.3116751868239056, "grad_norm": 1.475821852684021, "learning_rate": 5e-05, "loss": 2.0683, "step": 301000 }, { "epoch": 0.3127106525608621, "grad_norm": 1.4600526094436646, "learning_rate": 5e-05, "loss": 2.0679, "step": 302000 }, { "epoch": 0.31374611829781857, "grad_norm": 1.5122923851013184, "learning_rate": 5e-05, "loss": 2.0663, "step": 303000 }, { "epoch": 0.3147815840347751, "grad_norm": 1.6594598293304443, "learning_rate": 5e-05, "loss": 2.0575, "step": 304000 }, { "epoch": 0.3158170497717316, "grad_norm": 1.8523794412612915, "learning_rate": 5e-05, "loss": 2.0694, "step": 305000 }, { "epoch": 0.3158170497717316, "eval_loss": 2.060145616531372, "eval_runtime": 18.7888, "eval_samples_per_second": 2632.265, "eval_steps_per_second": 10.325, "step": 305000 }, { "epoch": 0.3168525155086881, "grad_norm": 1.1590874195098877, "learning_rate": 5e-05, "loss": 2.0648, "step": 306000 }, { "epoch": 0.31788798124564455, "grad_norm": 1.629807472229004, "learning_rate": 5e-05, "loss": 2.0632, "step": 307000 }, { "epoch": 0.31892344698260106, "grad_norm": 1.2362703084945679, "learning_rate": 5e-05, "loss": 2.052, "step": 308000 }, { "epoch": 0.31995891271955756, "grad_norm": 1.9399092197418213, "learning_rate": 5e-05, "loss": 2.0559, "step": 309000 }, { "epoch": 0.3209943784565141, "grad_norm": 1.528998613357544, "learning_rate": 5e-05, "loss": 2.0601, "step": 310000 }, { "epoch": 0.3209943784565141, "eval_loss": 2.0462470054626465, "eval_runtime": 19.5, "eval_samples_per_second": 2536.255, "eval_steps_per_second": 9.949, "step": 310000 }, { "epoch": 0.3220298441934706, "grad_norm": 1.3977261781692505, "learning_rate": 5e-05, "loss": 2.0534, "step": 311000 }, { "epoch": 0.32306530993042704, "grad_norm": 1.4338626861572266, "learning_rate": 5e-05, "loss": 2.0646, "step": 312000 }, { "epoch": 0.32410077566738354, "grad_norm": 1.7435204982757568, "learning_rate": 5e-05, "loss": 2.0523, "step": 313000 }, { "epoch": 0.32513624140434005, "grad_norm": 1.6941169500350952, "learning_rate": 5e-05, "loss": 2.0627, "step": 314000 }, { "epoch": 0.32617170714129656, "grad_norm": 1.7250036001205444, "learning_rate": 5e-05, "loss": 2.0623, "step": 315000 }, { "epoch": 0.32617170714129656, "eval_loss": 2.0483994483947754, "eval_runtime": 20.1553, "eval_samples_per_second": 2453.798, "eval_steps_per_second": 9.625, "step": 315000 }, { "epoch": 0.32720717287825307, "grad_norm": 1.5633450746536255, "learning_rate": 5e-05, "loss": 2.0504, "step": 316000 }, { "epoch": 0.3282426386152095, "grad_norm": 1.9888895750045776, "learning_rate": 5e-05, "loss": 2.051, "step": 317000 }, { "epoch": 0.32927810435216603, "grad_norm": 1.6756165027618408, "learning_rate": 5e-05, "loss": 2.0573, "step": 318000 }, { "epoch": 0.33031357008912254, "grad_norm": 1.574082851409912, "learning_rate": 5e-05, "loss": 2.0576, "step": 319000 }, { "epoch": 0.33134903582607905, "grad_norm": 1.7257170677185059, "learning_rate": 5e-05, "loss": 2.0676, "step": 320000 }, { "epoch": 0.33134903582607905, "eval_loss": 2.0458145141601562, "eval_runtime": 20.2597, "eval_samples_per_second": 2441.148, "eval_steps_per_second": 9.576, "step": 320000 }, { "epoch": 0.3323845015630355, "grad_norm": 1.3943850994110107, "learning_rate": 5e-05, "loss": 2.046, "step": 321000 }, { "epoch": 0.333419967299992, "grad_norm": 1.759932279586792, "learning_rate": 5e-05, "loss": 2.0438, "step": 322000 }, { "epoch": 0.3344554330369485, "grad_norm": 1.7004214525222778, "learning_rate": 5e-05, "loss": 2.0472, "step": 323000 }, { "epoch": 0.33549089877390503, "grad_norm": 1.6331087350845337, "learning_rate": 5e-05, "loss": 2.0518, "step": 324000 }, { "epoch": 0.33652636451086154, "grad_norm": 1.9464713335037231, "learning_rate": 5e-05, "loss": 2.0404, "step": 325000 }, { "epoch": 0.33652636451086154, "eval_loss": 2.0383198261260986, "eval_runtime": 19.6827, "eval_samples_per_second": 2512.719, "eval_steps_per_second": 9.856, "step": 325000 }, { "epoch": 0.337561830247818, "grad_norm": 1.6935348510742188, "learning_rate": 5e-05, "loss": 2.0541, "step": 326000 }, { "epoch": 0.3385972959847745, "grad_norm": 1.6641876697540283, "learning_rate": 5e-05, "loss": 2.0433, "step": 327000 }, { "epoch": 0.339632761721731, "grad_norm": 1.943715214729309, "learning_rate": 5e-05, "loss": 2.0469, "step": 328000 }, { "epoch": 0.3406682274586875, "grad_norm": 1.791492223739624, "learning_rate": 5e-05, "loss": 2.0469, "step": 329000 }, { "epoch": 0.341703693195644, "grad_norm": 1.4165565967559814, "learning_rate": 5e-05, "loss": 2.0403, "step": 330000 }, { "epoch": 0.341703693195644, "eval_loss": 2.038299083709717, "eval_runtime": 19.6239, "eval_samples_per_second": 2520.239, "eval_steps_per_second": 9.886, "step": 330000 }, { "epoch": 0.3427391589326005, "grad_norm": 1.9569101333618164, "learning_rate": 5e-05, "loss": 2.0615, "step": 331000 }, { "epoch": 0.343774624669557, "grad_norm": 1.3618338108062744, "learning_rate": 5e-05, "loss": 2.0404, "step": 332000 }, { "epoch": 0.3448100904065135, "grad_norm": 1.8879374265670776, "learning_rate": 5e-05, "loss": 2.0388, "step": 333000 }, { "epoch": 0.34584555614347, "grad_norm": 1.2820725440979004, "learning_rate": 5e-05, "loss": 2.0477, "step": 334000 }, { "epoch": 0.34688102188042647, "grad_norm": 1.3401552438735962, "learning_rate": 5e-05, "loss": 2.0438, "step": 335000 }, { "epoch": 0.34688102188042647, "eval_loss": 2.0328009128570557, "eval_runtime": 26.3779, "eval_samples_per_second": 1874.94, "eval_steps_per_second": 7.355, "step": 335000 }, { "epoch": 0.347916487617383, "grad_norm": 1.425150752067566, "learning_rate": 5e-05, "loss": 2.0465, "step": 336000 }, { "epoch": 0.3489519533543395, "grad_norm": 1.5219414234161377, "learning_rate": 5e-05, "loss": 2.0342, "step": 337000 }, { "epoch": 0.349987419091296, "grad_norm": 1.5171096324920654, "learning_rate": 5e-05, "loss": 2.0322, "step": 338000 }, { "epoch": 0.3510228848282525, "grad_norm": 1.8049135208129883, "learning_rate": 5e-05, "loss": 2.0376, "step": 339000 }, { "epoch": 0.35205835056520896, "grad_norm": 1.9653050899505615, "learning_rate": 5e-05, "loss": 2.0384, "step": 340000 }, { "epoch": 0.35205835056520896, "eval_loss": 2.021156072616577, "eval_runtime": 20.2299, "eval_samples_per_second": 2444.746, "eval_steps_per_second": 9.59, "step": 340000 }, { "epoch": 0.35309381630216546, "grad_norm": 1.5175777673721313, "learning_rate": 5e-05, "loss": 2.0384, "step": 341000 }, { "epoch": 0.354129282039122, "grad_norm": 1.6402604579925537, "learning_rate": 5e-05, "loss": 2.0403, "step": 342000 }, { "epoch": 0.3551647477760785, "grad_norm": 1.7806463241577148, "learning_rate": 5e-05, "loss": 2.0435, "step": 343000 }, { "epoch": 0.35620021351303494, "grad_norm": 1.825314998626709, "learning_rate": 5e-05, "loss": 2.0337, "step": 344000 }, { "epoch": 0.35723567924999144, "grad_norm": 1.6913727521896362, "learning_rate": 5e-05, "loss": 2.0374, "step": 345000 }, { "epoch": 0.35723567924999144, "eval_loss": 2.018705368041992, "eval_runtime": 19.9799, "eval_samples_per_second": 2475.341, "eval_steps_per_second": 9.71, "step": 345000 }, { "epoch": 0.35827114498694795, "grad_norm": 1.3637776374816895, "learning_rate": 5e-05, "loss": 2.0404, "step": 346000 }, { "epoch": 0.35930661072390446, "grad_norm": 1.955057978630066, "learning_rate": 5e-05, "loss": 2.0286, "step": 347000 }, { "epoch": 0.36034207646086097, "grad_norm": 1.516166090965271, "learning_rate": 5e-05, "loss": 2.0311, "step": 348000 }, { "epoch": 0.3613775421978174, "grad_norm": 2.4298129081726074, "learning_rate": 5e-05, "loss": 2.0348, "step": 349000 }, { "epoch": 0.36241300793477393, "grad_norm": 1.5734537839889526, "learning_rate": 5e-05, "loss": 2.0333, "step": 350000 }, { "epoch": 0.36241300793477393, "eval_loss": 2.019728422164917, "eval_runtime": 19.4884, "eval_samples_per_second": 2537.767, "eval_steps_per_second": 9.955, "step": 350000 }, { "epoch": 0.36344847367173044, "grad_norm": 1.2852146625518799, "learning_rate": 5e-05, "loss": 2.0381, "step": 351000 }, { "epoch": 0.36448393940868695, "grad_norm": 1.62471342086792, "learning_rate": 5e-05, "loss": 2.0375, "step": 352000 }, { "epoch": 0.3655194051456434, "grad_norm": 1.6946029663085938, "learning_rate": 5e-05, "loss": 2.0291, "step": 353000 }, { "epoch": 0.3665548708825999, "grad_norm": 1.8819012641906738, "learning_rate": 5e-05, "loss": 2.0251, "step": 354000 }, { "epoch": 0.3675903366195564, "grad_norm": 1.8506168127059937, "learning_rate": 5e-05, "loss": 2.0297, "step": 355000 }, { "epoch": 0.3675903366195564, "eval_loss": 2.0166077613830566, "eval_runtime": 20.1725, "eval_samples_per_second": 2451.7, "eval_steps_per_second": 9.617, "step": 355000 }, { "epoch": 0.36862580235651293, "grad_norm": 1.6591482162475586, "learning_rate": 5e-05, "loss": 2.0279, "step": 356000 }, { "epoch": 0.36966126809346944, "grad_norm": 1.9256037473678589, "learning_rate": 5e-05, "loss": 2.0314, "step": 357000 }, { "epoch": 0.3706967338304259, "grad_norm": 1.5882221460342407, "learning_rate": 5e-05, "loss": 2.0365, "step": 358000 }, { "epoch": 0.3717321995673824, "grad_norm": 1.8153852224349976, "learning_rate": 5e-05, "loss": 2.0201, "step": 359000 }, { "epoch": 0.3727676653043389, "grad_norm": 1.7015492916107178, "learning_rate": 5e-05, "loss": 2.027, "step": 360000 }, { "epoch": 0.3727676653043389, "eval_loss": 2.019824504852295, "eval_runtime": 20.5579, "eval_samples_per_second": 2405.737, "eval_steps_per_second": 9.437, "step": 360000 }, { "epoch": 0.3738031310412954, "grad_norm": 1.973598837852478, "learning_rate": 5e-05, "loss": 2.0345, "step": 361000 }, { "epoch": 0.37483859677825193, "grad_norm": 1.685168981552124, "learning_rate": 5e-05, "loss": 2.0336, "step": 362000 }, { "epoch": 0.3758740625152084, "grad_norm": 1.4376262426376343, "learning_rate": 5e-05, "loss": 2.017, "step": 363000 }, { "epoch": 0.3769095282521649, "grad_norm": 1.309055209159851, "learning_rate": 5e-05, "loss": 2.0348, "step": 364000 }, { "epoch": 0.3779449939891214, "grad_norm": 1.5164344310760498, "learning_rate": 5e-05, "loss": 2.034, "step": 365000 }, { "epoch": 0.3779449939891214, "eval_loss": 2.0100014209747314, "eval_runtime": 20.1879, "eval_samples_per_second": 2449.835, "eval_steps_per_second": 9.61, "step": 365000 }, { "epoch": 0.3789804597260779, "grad_norm": 1.476970911026001, "learning_rate": 5e-05, "loss": 2.0214, "step": 366000 }, { "epoch": 0.38001592546303437, "grad_norm": 1.4336302280426025, "learning_rate": 5e-05, "loss": 2.0237, "step": 367000 }, { "epoch": 0.3810513911999909, "grad_norm": 1.7313731908798218, "learning_rate": 5e-05, "loss": 2.0215, "step": 368000 }, { "epoch": 0.3820868569369474, "grad_norm": 2.014364719390869, "learning_rate": 5e-05, "loss": 2.0105, "step": 369000 }, { "epoch": 0.3831223226739039, "grad_norm": 1.2657063007354736, "learning_rate": 5e-05, "loss": 2.03, "step": 370000 }, { "epoch": 0.3831223226739039, "eval_loss": 2.0104730129241943, "eval_runtime": 19.5262, "eval_samples_per_second": 2532.856, "eval_steps_per_second": 9.935, "step": 370000 }, { "epoch": 0.3841577884108604, "grad_norm": 2.069312334060669, "learning_rate": 5e-05, "loss": 2.0307, "step": 371000 }, { "epoch": 0.38519325414781685, "grad_norm": 1.383955955505371, "learning_rate": 5e-05, "loss": 2.0294, "step": 372000 }, { "epoch": 0.38622871988477336, "grad_norm": 1.489496111869812, "learning_rate": 5e-05, "loss": 2.0232, "step": 373000 }, { "epoch": 0.3872641856217299, "grad_norm": 1.5222963094711304, "learning_rate": 5e-05, "loss": 2.0066, "step": 374000 }, { "epoch": 0.3882996513586864, "grad_norm": 1.6142817735671997, "learning_rate": 5e-05, "loss": 2.006, "step": 375000 }, { "epoch": 0.3882996513586864, "eval_loss": 2.0057575702667236, "eval_runtime": 20.1185, "eval_samples_per_second": 2458.285, "eval_steps_per_second": 9.643, "step": 375000 }, { "epoch": 0.3893351170956429, "grad_norm": 1.6987862586975098, "learning_rate": 5e-05, "loss": 2.0187, "step": 376000 }, { "epoch": 0.39037058283259934, "grad_norm": 1.5701992511749268, "learning_rate": 5e-05, "loss": 2.0086, "step": 377000 }, { "epoch": 0.39140604856955585, "grad_norm": 1.737766146659851, "learning_rate": 5e-05, "loss": 2.0136, "step": 378000 }, { "epoch": 0.39244151430651236, "grad_norm": 1.5094623565673828, "learning_rate": 5e-05, "loss": 2.0112, "step": 379000 }, { "epoch": 0.39347698004346887, "grad_norm": 1.720401644706726, "learning_rate": 5e-05, "loss": 2.0301, "step": 380000 }, { "epoch": 0.39347698004346887, "eval_loss": 2.005438804626465, "eval_runtime": 21.5272, "eval_samples_per_second": 2297.424, "eval_steps_per_second": 9.012, "step": 380000 }, { "epoch": 0.3945124457804253, "grad_norm": 1.6071189641952515, "learning_rate": 5e-05, "loss": 2.0186, "step": 381000 }, { "epoch": 0.39554791151738183, "grad_norm": 1.3733688592910767, "learning_rate": 5e-05, "loss": 2.0177, "step": 382000 }, { "epoch": 0.39658337725433834, "grad_norm": 1.7012932300567627, "learning_rate": 5e-05, "loss": 2.0125, "step": 383000 }, { "epoch": 0.39761884299129485, "grad_norm": 1.656907320022583, "learning_rate": 5e-05, "loss": 2.0204, "step": 384000 }, { "epoch": 0.39865430872825136, "grad_norm": 1.859150767326355, "learning_rate": 5e-05, "loss": 2.0105, "step": 385000 }, { "epoch": 0.39865430872825136, "eval_loss": 2.0000715255737305, "eval_runtime": 20.4022, "eval_samples_per_second": 2424.105, "eval_steps_per_second": 9.509, "step": 385000 }, { "epoch": 0.3996897744652078, "grad_norm": 1.4494318962097168, "learning_rate": 5e-05, "loss": 2.0109, "step": 386000 }, { "epoch": 0.4007252402021643, "grad_norm": 1.7626131772994995, "learning_rate": 5e-05, "loss": 2.0249, "step": 387000 }, { "epoch": 0.40176070593912083, "grad_norm": 1.2615996599197388, "learning_rate": 5e-05, "loss": 2.0044, "step": 388000 }, { "epoch": 0.40279617167607734, "grad_norm": 2.000866174697876, "learning_rate": 5e-05, "loss": 1.9996, "step": 389000 }, { "epoch": 0.4038316374130338, "grad_norm": 1.5822980403900146, "learning_rate": 5e-05, "loss": 2.0113, "step": 390000 }, { "epoch": 0.4038316374130338, "eval_loss": 2.0007712841033936, "eval_runtime": 19.8198, "eval_samples_per_second": 2495.337, "eval_steps_per_second": 9.788, "step": 390000 }, { "epoch": 0.4048671031499903, "grad_norm": 1.8089967966079712, "learning_rate": 5e-05, "loss": 2.0017, "step": 391000 }, { "epoch": 0.4059025688869468, "grad_norm": 2.0939323902130127, "learning_rate": 5e-05, "loss": 2.017, "step": 392000 }, { "epoch": 0.4069380346239033, "grad_norm": 1.581272840499878, "learning_rate": 5e-05, "loss": 2.0061, "step": 393000 }, { "epoch": 0.40797350036085983, "grad_norm": 1.3183523416519165, "learning_rate": 5e-05, "loss": 2.0044, "step": 394000 }, { "epoch": 0.4090089660978163, "grad_norm": 1.9399663209915161, "learning_rate": 5e-05, "loss": 2.0, "step": 395000 }, { "epoch": 0.4090089660978163, "eval_loss": 1.9924626350402832, "eval_runtime": 20.3614, "eval_samples_per_second": 2428.953, "eval_steps_per_second": 9.528, "step": 395000 }, { "epoch": 0.4100444318347728, "grad_norm": 1.667758584022522, "learning_rate": 5e-05, "loss": 2.0093, "step": 396000 }, { "epoch": 0.4110798975717293, "grad_norm": 2.133141040802002, "learning_rate": 5e-05, "loss": 2.0159, "step": 397000 }, { "epoch": 0.4121153633086858, "grad_norm": 1.440760612487793, "learning_rate": 5e-05, "loss": 2.0047, "step": 398000 }, { "epoch": 0.4131508290456423, "grad_norm": 1.7397841215133667, "learning_rate": 5e-05, "loss": 1.9918, "step": 399000 }, { "epoch": 0.4141862947825988, "grad_norm": 1.8425333499908447, "learning_rate": 5e-05, "loss": 2.0072, "step": 400000 }, { "epoch": 0.4141862947825988, "eval_loss": 1.9991180896759033, "eval_runtime": 20.8264, "eval_samples_per_second": 2374.73, "eval_steps_per_second": 9.315, "step": 400000 }, { "epoch": 0.4152217605195553, "grad_norm": 1.6759531497955322, "learning_rate": 5e-05, "loss": 2.0059, "step": 401000 }, { "epoch": 0.4162572262565118, "grad_norm": 1.6831870079040527, "learning_rate": 5e-05, "loss": 1.9958, "step": 402000 }, { "epoch": 0.4172926919934683, "grad_norm": 1.5509451627731323, "learning_rate": 5e-05, "loss": 2.0137, "step": 403000 }, { "epoch": 0.41832815773042475, "grad_norm": 1.3839584589004517, "learning_rate": 5e-05, "loss": 1.9961, "step": 404000 }, { "epoch": 0.41936362346738126, "grad_norm": 1.8058265447616577, "learning_rate": 5e-05, "loss": 1.9999, "step": 405000 }, { "epoch": 0.41936362346738126, "eval_loss": 1.9969453811645508, "eval_runtime": 20.4837, "eval_samples_per_second": 2414.454, "eval_steps_per_second": 9.471, "step": 405000 }, { "epoch": 0.4203990892043378, "grad_norm": 2.366974353790283, "learning_rate": 5e-05, "loss": 2.0044, "step": 406000 }, { "epoch": 0.4214345549412943, "grad_norm": 1.7304997444152832, "learning_rate": 5e-05, "loss": 2.0082, "step": 407000 }, { "epoch": 0.4224700206782508, "grad_norm": 1.6823433637619019, "learning_rate": 5e-05, "loss": 2.015, "step": 408000 }, { "epoch": 0.42350548641520724, "grad_norm": 1.8099905252456665, "learning_rate": 5e-05, "loss": 2.0005, "step": 409000 }, { "epoch": 0.42454095215216375, "grad_norm": 1.789655089378357, "learning_rate": 5e-05, "loss": 1.986, "step": 410000 }, { "epoch": 0.42454095215216375, "eval_loss": 1.9884740114212036, "eval_runtime": 20.7019, "eval_samples_per_second": 2389.009, "eval_steps_per_second": 9.371, "step": 410000 }, { "epoch": 0.42557641788912026, "grad_norm": 1.3981925249099731, "learning_rate": 5e-05, "loss": 2.0071, "step": 411000 }, { "epoch": 0.42661188362607677, "grad_norm": 1.5761098861694336, "learning_rate": 5e-05, "loss": 1.9996, "step": 412000 }, { "epoch": 0.4276473493630332, "grad_norm": 1.5765936374664307, "learning_rate": 5e-05, "loss": 1.9979, "step": 413000 }, { "epoch": 0.42868281509998973, "grad_norm": 1.5242362022399902, "learning_rate": 5e-05, "loss": 1.9872, "step": 414000 }, { "epoch": 0.42971828083694624, "grad_norm": 1.7076730728149414, "learning_rate": 5e-05, "loss": 2.003, "step": 415000 }, { "epoch": 0.42971828083694624, "eval_loss": 1.9865856170654297, "eval_runtime": 19.7883, "eval_samples_per_second": 2499.301, "eval_steps_per_second": 9.804, "step": 415000 }, { "epoch": 0.43075374657390275, "grad_norm": 1.5453383922576904, "learning_rate": 5e-05, "loss": 2.0, "step": 416000 }, { "epoch": 0.43178921231085926, "grad_norm": 1.9326260089874268, "learning_rate": 5e-05, "loss": 2.0007, "step": 417000 }, { "epoch": 0.4328246780478157, "grad_norm": 1.6275213956832886, "learning_rate": 5e-05, "loss": 1.99, "step": 418000 }, { "epoch": 0.4338601437847722, "grad_norm": 1.4359986782073975, "learning_rate": 5e-05, "loss": 1.9895, "step": 419000 }, { "epoch": 0.43489560952172873, "grad_norm": 1.7742505073547363, "learning_rate": 5e-05, "loss": 2.0013, "step": 420000 }, { "epoch": 0.43489560952172873, "eval_loss": 1.9821418523788452, "eval_runtime": 20.3362, "eval_samples_per_second": 2431.974, "eval_steps_per_second": 9.54, "step": 420000 }, { "epoch": 0.43593107525868524, "grad_norm": 1.7934837341308594, "learning_rate": 5e-05, "loss": 1.9856, "step": 421000 }, { "epoch": 0.43696654099564175, "grad_norm": 1.4223476648330688, "learning_rate": 5e-05, "loss": 1.9922, "step": 422000 }, { "epoch": 0.4380020067325982, "grad_norm": 1.4957746267318726, "learning_rate": 5e-05, "loss": 1.9842, "step": 423000 }, { "epoch": 0.4390374724695547, "grad_norm": 1.7151662111282349, "learning_rate": 5e-05, "loss": 1.9929, "step": 424000 }, { "epoch": 0.4400729382065112, "grad_norm": 1.6285368204116821, "learning_rate": 5e-05, "loss": 1.9955, "step": 425000 }, { "epoch": 0.4400729382065112, "eval_loss": 1.9809545278549194, "eval_runtime": 20.6817, "eval_samples_per_second": 2391.337, "eval_steps_per_second": 9.38, "step": 425000 }, { "epoch": 0.44110840394346773, "grad_norm": 1.7479044198989868, "learning_rate": 5e-05, "loss": 1.9876, "step": 426000 }, { "epoch": 0.4421438696804242, "grad_norm": 1.6902720928192139, "learning_rate": 5e-05, "loss": 1.9939, "step": 427000 }, { "epoch": 0.4431793354173807, "grad_norm": 1.5723377466201782, "learning_rate": 5e-05, "loss": 1.9845, "step": 428000 }, { "epoch": 0.4442148011543372, "grad_norm": 1.6776286363601685, "learning_rate": 5e-05, "loss": 1.9852, "step": 429000 }, { "epoch": 0.4452502668912937, "grad_norm": 2.08964204788208, "learning_rate": 5e-05, "loss": 1.997, "step": 430000 }, { "epoch": 0.4452502668912937, "eval_loss": 1.9794005155563354, "eval_runtime": 21.8926, "eval_samples_per_second": 2259.078, "eval_steps_per_second": 8.861, "step": 430000 }, { "epoch": 0.4462857326282502, "grad_norm": 1.450403094291687, "learning_rate": 5e-05, "loss": 1.9929, "step": 431000 }, { "epoch": 0.4473211983652067, "grad_norm": 1.9465093612670898, "learning_rate": 5e-05, "loss": 1.982, "step": 432000 }, { "epoch": 0.4483566641021632, "grad_norm": 1.9567445516586304, "learning_rate": 5e-05, "loss": 1.9865, "step": 433000 }, { "epoch": 0.4493921298391197, "grad_norm": 1.8487781286239624, "learning_rate": 5e-05, "loss": 1.9861, "step": 434000 }, { "epoch": 0.4504275955760762, "grad_norm": 1.77712082862854, "learning_rate": 5e-05, "loss": 1.9831, "step": 435000 }, { "epoch": 0.4504275955760762, "eval_loss": 1.9783129692077637, "eval_runtime": 20.7598, "eval_samples_per_second": 2382.35, "eval_steps_per_second": 9.345, "step": 435000 }, { "epoch": 0.4514630613130327, "grad_norm": 1.4690871238708496, "learning_rate": 5e-05, "loss": 1.984, "step": 436000 }, { "epoch": 0.45249852704998916, "grad_norm": 1.974969506263733, "learning_rate": 5e-05, "loss": 1.9857, "step": 437000 }, { "epoch": 0.45353399278694567, "grad_norm": 2.069812774658203, "learning_rate": 5e-05, "loss": 1.9851, "step": 438000 }, { "epoch": 0.4545694585239022, "grad_norm": 1.603868842124939, "learning_rate": 5e-05, "loss": 1.9768, "step": 439000 }, { "epoch": 0.4556049242608587, "grad_norm": 1.8147460222244263, "learning_rate": 5e-05, "loss": 1.9948, "step": 440000 }, { "epoch": 0.4556049242608587, "eval_loss": 1.9627678394317627, "eval_runtime": 20.2676, "eval_samples_per_second": 2440.206, "eval_steps_per_second": 9.572, "step": 440000 }, { "epoch": 0.45664038999781514, "grad_norm": 1.8408571481704712, "learning_rate": 5e-05, "loss": 1.9941, "step": 441000 }, { "epoch": 0.45767585573477165, "grad_norm": 1.523547649383545, "learning_rate": 5e-05, "loss": 1.9889, "step": 442000 }, { "epoch": 0.45871132147172816, "grad_norm": 1.6914969682693481, "learning_rate": 5e-05, "loss": 1.9828, "step": 443000 }, { "epoch": 0.45974678720868467, "grad_norm": 1.5032548904418945, "learning_rate": 5e-05, "loss": 1.9863, "step": 444000 }, { "epoch": 0.4607822529456412, "grad_norm": 1.5079525709152222, "learning_rate": 5e-05, "loss": 1.9828, "step": 445000 }, { "epoch": 0.4607822529456412, "eval_loss": 1.9710944890975952, "eval_runtime": 20.9498, "eval_samples_per_second": 2360.739, "eval_steps_per_second": 9.26, "step": 445000 }, { "epoch": 0.46181771868259763, "grad_norm": 1.5467554330825806, "learning_rate": 5e-05, "loss": 1.9786, "step": 446000 }, { "epoch": 0.46285318441955414, "grad_norm": 1.4302737712860107, "learning_rate": 5e-05, "loss": 1.988, "step": 447000 }, { "epoch": 0.46388865015651065, "grad_norm": 1.6843082904815674, "learning_rate": 5e-05, "loss": 1.9729, "step": 448000 }, { "epoch": 0.46492411589346716, "grad_norm": 1.6686064004898071, "learning_rate": 5e-05, "loss": 1.9856, "step": 449000 }, { "epoch": 0.4659595816304236, "grad_norm": 1.7648873329162598, "learning_rate": 5e-05, "loss": 1.977, "step": 450000 }, { "epoch": 0.4659595816304236, "eval_loss": 1.9715555906295776, "eval_runtime": 20.5454, "eval_samples_per_second": 2407.206, "eval_steps_per_second": 9.443, "step": 450000 }, { "epoch": 0.4669950473673801, "grad_norm": 1.4987176656723022, "learning_rate": 5e-05, "loss": 1.9777, "step": 451000 }, { "epoch": 0.46803051310433663, "grad_norm": 1.6089203357696533, "learning_rate": 5e-05, "loss": 1.9803, "step": 452000 }, { "epoch": 0.46906597884129314, "grad_norm": 1.430272102355957, "learning_rate": 5e-05, "loss": 1.9824, "step": 453000 }, { "epoch": 0.47010144457824965, "grad_norm": 1.4526315927505493, "learning_rate": 5e-05, "loss": 1.9811, "step": 454000 }, { "epoch": 0.4711369103152061, "grad_norm": 1.6255720853805542, "learning_rate": 5e-05, "loss": 1.9749, "step": 455000 }, { "epoch": 0.4711369103152061, "eval_loss": 1.972861409187317, "eval_runtime": 21.2643, "eval_samples_per_second": 2325.821, "eval_steps_per_second": 9.123, "step": 455000 }, { "epoch": 0.4721723760521626, "grad_norm": 1.470329761505127, "learning_rate": 5e-05, "loss": 1.9825, "step": 456000 }, { "epoch": 0.4732078417891191, "grad_norm": 1.5077927112579346, "learning_rate": 5e-05, "loss": 1.9826, "step": 457000 }, { "epoch": 0.47424330752607563, "grad_norm": 1.5065680742263794, "learning_rate": 5e-05, "loss": 1.9794, "step": 458000 }, { "epoch": 0.47527877326303214, "grad_norm": 1.6117650270462036, "learning_rate": 5e-05, "loss": 1.9769, "step": 459000 }, { "epoch": 0.4763142389999886, "grad_norm": 1.5135180950164795, "learning_rate": 5e-05, "loss": 1.9709, "step": 460000 }, { "epoch": 0.4763142389999886, "eval_loss": 1.9691742658615112, "eval_runtime": 21.5566, "eval_samples_per_second": 2294.281, "eval_steps_per_second": 9.0, "step": 460000 }, { "epoch": 0.4773497047369451, "grad_norm": 1.9140368700027466, "learning_rate": 5e-05, "loss": 1.9724, "step": 461000 }, { "epoch": 0.4783851704739016, "grad_norm": 1.59769606590271, "learning_rate": 5e-05, "loss": 1.9773, "step": 462000 }, { "epoch": 0.4794206362108581, "grad_norm": 1.7482682466506958, "learning_rate": 5e-05, "loss": 1.9668, "step": 463000 }, { "epoch": 0.4804561019478146, "grad_norm": 1.3569929599761963, "learning_rate": 5e-05, "loss": 1.9759, "step": 464000 }, { "epoch": 0.4814915676847711, "grad_norm": 1.793421745300293, "learning_rate": 5e-05, "loss": 1.9803, "step": 465000 }, { "epoch": 0.4814915676847711, "eval_loss": 1.965020775794983, "eval_runtime": 20.5866, "eval_samples_per_second": 2402.394, "eval_steps_per_second": 9.424, "step": 465000 }, { "epoch": 0.4825270334217276, "grad_norm": 1.6482970714569092, "learning_rate": 5e-05, "loss": 1.9751, "step": 466000 }, { "epoch": 0.4835624991586841, "grad_norm": 1.9341484308242798, "learning_rate": 5e-05, "loss": 1.9804, "step": 467000 }, { "epoch": 0.4845979648956406, "grad_norm": 1.7689942121505737, "learning_rate": 5e-05, "loss": 1.9709, "step": 468000 }, { "epoch": 0.48563343063259706, "grad_norm": 1.5617495775222778, "learning_rate": 5e-05, "loss": 1.9817, "step": 469000 }, { "epoch": 0.48666889636955357, "grad_norm": 1.3667575120925903, "learning_rate": 5e-05, "loss": 1.9678, "step": 470000 }, { "epoch": 0.48666889636955357, "eval_loss": 1.9600496292114258, "eval_runtime": 20.8747, "eval_samples_per_second": 2369.229, "eval_steps_per_second": 9.294, "step": 470000 }, { "epoch": 0.4877043621065101, "grad_norm": 1.5237644910812378, "learning_rate": 5e-05, "loss": 1.9686, "step": 471000 }, { "epoch": 0.4887398278434666, "grad_norm": 1.3905088901519775, "learning_rate": 5e-05, "loss": 1.9668, "step": 472000 }, { "epoch": 0.48977529358042304, "grad_norm": 1.2724034786224365, "learning_rate": 5e-05, "loss": 1.986, "step": 473000 }, { "epoch": 0.49081075931737955, "grad_norm": 1.9573471546173096, "learning_rate": 5e-05, "loss": 1.9712, "step": 474000 }, { "epoch": 0.49184622505433606, "grad_norm": 1.5011438131332397, "learning_rate": 5e-05, "loss": 1.972, "step": 475000 }, { "epoch": 0.49184622505433606, "eval_loss": 1.9591975212097168, "eval_runtime": 20.7331, "eval_samples_per_second": 2385.41, "eval_steps_per_second": 9.357, "step": 475000 }, { "epoch": 0.49288169079129257, "grad_norm": 1.5569186210632324, "learning_rate": 5e-05, "loss": 1.9717, "step": 476000 }, { "epoch": 0.4939171565282491, "grad_norm": 1.3786282539367676, "learning_rate": 5e-05, "loss": 1.9665, "step": 477000 }, { "epoch": 0.49495262226520553, "grad_norm": 1.557861328125, "learning_rate": 5e-05, "loss": 1.9757, "step": 478000 }, { "epoch": 0.49598808800216204, "grad_norm": 1.5497270822525024, "learning_rate": 5e-05, "loss": 1.9756, "step": 479000 }, { "epoch": 0.49702355373911855, "grad_norm": 1.4066565036773682, "learning_rate": 5e-05, "loss": 1.9674, "step": 480000 }, { "epoch": 0.49702355373911855, "eval_loss": 1.9571633338928223, "eval_runtime": 20.9157, "eval_samples_per_second": 2364.586, "eval_steps_per_second": 9.275, "step": 480000 }, { "epoch": 0.49805901947607506, "grad_norm": 1.862382173538208, "learning_rate": 5e-05, "loss": 1.9645, "step": 481000 }, { "epoch": 0.49909448521303157, "grad_norm": 1.7556581497192383, "learning_rate": 5e-05, "loss": 1.9779, "step": 482000 }, { "epoch": 0.500129950949988, "grad_norm": 1.5234323740005493, "learning_rate": 5e-05, "loss": 1.9611, "step": 483000 }, { "epoch": 0.5011654166869446, "grad_norm": 1.395575761795044, "learning_rate": 5e-05, "loss": 1.9707, "step": 484000 }, { "epoch": 0.502200882423901, "grad_norm": 1.82569420337677, "learning_rate": 5e-05, "loss": 1.9659, "step": 485000 }, { "epoch": 0.502200882423901, "eval_loss": 1.960773229598999, "eval_runtime": 20.3475, "eval_samples_per_second": 2430.618, "eval_steps_per_second": 9.534, "step": 485000 }, { "epoch": 0.5032363481608575, "grad_norm": 1.3626302480697632, "learning_rate": 5e-05, "loss": 1.9606, "step": 486000 }, { "epoch": 0.5042718138978141, "grad_norm": 1.4180521965026855, "learning_rate": 5e-05, "loss": 1.9689, "step": 487000 }, { "epoch": 0.5053072796347705, "grad_norm": 1.9080036878585815, "learning_rate": 5e-05, "loss": 1.9618, "step": 488000 }, { "epoch": 0.5063427453717271, "grad_norm": 1.3812907934188843, "learning_rate": 5e-05, "loss": 1.9663, "step": 489000 }, { "epoch": 0.5073782111086835, "grad_norm": 1.6814547777175903, "learning_rate": 5e-05, "loss": 1.9612, "step": 490000 }, { "epoch": 0.5073782111086835, "eval_loss": 1.9534902572631836, "eval_runtime": 21.4593, "eval_samples_per_second": 2304.692, "eval_steps_per_second": 9.04, "step": 490000 }, { "epoch": 0.50841367684564, "grad_norm": 1.7079062461853027, "learning_rate": 5e-05, "loss": 1.9644, "step": 491000 }, { "epoch": 0.5094491425825965, "grad_norm": 1.8109638690948486, "learning_rate": 5e-05, "loss": 1.9602, "step": 492000 }, { "epoch": 0.510484608319553, "grad_norm": 1.6124908924102783, "learning_rate": 5e-05, "loss": 1.9582, "step": 493000 }, { "epoch": 0.5115200740565095, "grad_norm": 1.6076602935791016, "learning_rate": 5e-05, "loss": 1.9548, "step": 494000 }, { "epoch": 0.512555539793466, "grad_norm": 1.3506944179534912, "learning_rate": 5e-05, "loss": 1.9571, "step": 495000 }, { "epoch": 0.512555539793466, "eval_loss": 1.9505630731582642, "eval_runtime": 20.7858, "eval_samples_per_second": 2379.366, "eval_steps_per_second": 9.333, "step": 495000 }, { "epoch": 0.5135910055304225, "grad_norm": 1.4965591430664062, "learning_rate": 5e-05, "loss": 1.9625, "step": 496000 }, { "epoch": 0.514626471267379, "grad_norm": 3.6810128688812256, "learning_rate": 5e-05, "loss": 1.9616, "step": 497000 }, { "epoch": 0.5156619370043355, "grad_norm": 1.7980040311813354, "learning_rate": 5e-05, "loss": 1.9685, "step": 498000 }, { "epoch": 0.516697402741292, "grad_norm": 2.078582286834717, "learning_rate": 5e-05, "loss": 1.9583, "step": 499000 }, { "epoch": 0.5177328684782485, "grad_norm": 1.7889291048049927, "learning_rate": 5e-05, "loss": 1.9468, "step": 500000 }, { "epoch": 0.5177328684782485, "eval_loss": 1.9439510107040405, "eval_runtime": 20.9843, "eval_samples_per_second": 2356.859, "eval_steps_per_second": 9.245, "step": 500000 }, { "epoch": 0.518768334215205, "grad_norm": 1.7821800708770752, "learning_rate": 5e-05, "loss": 1.9471, "step": 501000 }, { "epoch": 0.5198037999521615, "grad_norm": 1.6444828510284424, "learning_rate": 5e-05, "loss": 1.957, "step": 502000 }, { "epoch": 0.520839265689118, "grad_norm": 1.563770055770874, "learning_rate": 5e-05, "loss": 1.96, "step": 503000 }, { "epoch": 0.5218747314260744, "grad_norm": 1.7522222995758057, "learning_rate": 5e-05, "loss": 1.9566, "step": 504000 }, { "epoch": 0.522910197163031, "grad_norm": 1.9874364137649536, "learning_rate": 5e-05, "loss": 1.95, "step": 505000 }, { "epoch": 0.522910197163031, "eval_loss": 1.9490445852279663, "eval_runtime": 20.4006, "eval_samples_per_second": 2424.286, "eval_steps_per_second": 9.51, "step": 505000 }, { "epoch": 0.5239456628999875, "grad_norm": 1.693300724029541, "learning_rate": 5e-05, "loss": 1.9627, "step": 506000 }, { "epoch": 0.524981128636944, "grad_norm": 1.4723318815231323, "learning_rate": 5e-05, "loss": 1.9523, "step": 507000 }, { "epoch": 0.5260165943739005, "grad_norm": 1.6969449520111084, "learning_rate": 5e-05, "loss": 1.9583, "step": 508000 }, { "epoch": 0.5270520601108569, "grad_norm": 2.029672861099243, "learning_rate": 5e-05, "loss": 1.9561, "step": 509000 }, { "epoch": 0.5280875258478135, "grad_norm": 1.733914852142334, "learning_rate": 5e-05, "loss": 1.9458, "step": 510000 }, { "epoch": 0.5280875258478135, "eval_loss": 1.947721004486084, "eval_runtime": 20.9868, "eval_samples_per_second": 2356.582, "eval_steps_per_second": 9.244, "step": 510000 }, { "epoch": 0.5291229915847699, "grad_norm": 1.4405544996261597, "learning_rate": 5e-05, "loss": 1.9605, "step": 511000 }, { "epoch": 0.5301584573217265, "grad_norm": 1.559751272201538, "learning_rate": 5e-05, "loss": 1.9635, "step": 512000 }, { "epoch": 0.531193923058683, "grad_norm": 1.570190191268921, "learning_rate": 5e-05, "loss": 1.9599, "step": 513000 }, { "epoch": 0.5322293887956394, "grad_norm": 1.7898485660552979, "learning_rate": 5e-05, "loss": 1.9665, "step": 514000 }, { "epoch": 0.533264854532596, "grad_norm": 1.6552945375442505, "learning_rate": 5e-05, "loss": 1.9477, "step": 515000 }, { "epoch": 0.533264854532596, "eval_loss": 1.9498939514160156, "eval_runtime": 20.4739, "eval_samples_per_second": 2415.616, "eval_steps_per_second": 9.475, "step": 515000 }, { "epoch": 0.5343003202695524, "grad_norm": 1.383664608001709, "learning_rate": 5e-05, "loss": 1.9655, "step": 516000 }, { "epoch": 0.5353357860065089, "grad_norm": 1.8790479898452759, "learning_rate": 5e-05, "loss": 1.9548, "step": 517000 }, { "epoch": 0.5363712517434654, "grad_norm": 2.0052480697631836, "learning_rate": 5e-05, "loss": 1.9499, "step": 518000 }, { "epoch": 0.5374067174804219, "grad_norm": 1.55815589427948, "learning_rate": 5e-05, "loss": 1.9489, "step": 519000 }, { "epoch": 0.5384421832173785, "grad_norm": 1.6805282831192017, "learning_rate": 5e-05, "loss": 1.9517, "step": 520000 }, { "epoch": 0.5384421832173785, "eval_loss": 1.9381487369537354, "eval_runtime": 21.4027, "eval_samples_per_second": 2310.779, "eval_steps_per_second": 9.064, "step": 520000 }, { "epoch": 0.5394776489543349, "grad_norm": 1.5910086631774902, "learning_rate": 5e-05, "loss": 1.9535, "step": 521000 }, { "epoch": 0.5405131146912914, "grad_norm": 1.8047144412994385, "learning_rate": 5e-05, "loss": 1.9648, "step": 522000 }, { "epoch": 0.5415485804282479, "grad_norm": 2.062201976776123, "learning_rate": 5e-05, "loss": 1.9476, "step": 523000 }, { "epoch": 0.5425840461652044, "grad_norm": 1.7508527040481567, "learning_rate": 5e-05, "loss": 1.9569, "step": 524000 }, { "epoch": 0.543619511902161, "grad_norm": 1.8220138549804688, "learning_rate": 5e-05, "loss": 1.9438, "step": 525000 }, { "epoch": 0.543619511902161, "eval_loss": 1.9415658712387085, "eval_runtime": 20.3248, "eval_samples_per_second": 2433.336, "eval_steps_per_second": 9.545, "step": 525000 }, { "epoch": 0.5446549776391174, "grad_norm": 1.4566571712493896, "learning_rate": 5e-05, "loss": 1.9451, "step": 526000 }, { "epoch": 0.5456904433760739, "grad_norm": 1.66045081615448, "learning_rate": 5e-05, "loss": 1.947, "step": 527000 }, { "epoch": 0.5467259091130304, "grad_norm": 1.6011165380477905, "learning_rate": 5e-05, "loss": 1.9504, "step": 528000 }, { "epoch": 0.5477613748499869, "grad_norm": 1.4785083532333374, "learning_rate": 5e-05, "loss": 1.9418, "step": 529000 }, { "epoch": 0.5487968405869434, "grad_norm": 1.584233045578003, "learning_rate": 5e-05, "loss": 1.9519, "step": 530000 }, { "epoch": 0.5487968405869434, "eval_loss": 1.9392527341842651, "eval_runtime": 20.2285, "eval_samples_per_second": 2444.917, "eval_steps_per_second": 9.59, "step": 530000 }, { "epoch": 0.5498323063238999, "grad_norm": 1.7094584703445435, "learning_rate": 5e-05, "loss": 1.9535, "step": 531000 }, { "epoch": 0.5508677720608564, "grad_norm": 1.5422221422195435, "learning_rate": 5e-05, "loss": 1.9615, "step": 532000 }, { "epoch": 0.5519032377978129, "grad_norm": 2.052283525466919, "learning_rate": 5e-05, "loss": 1.9607, "step": 533000 }, { "epoch": 0.5529387035347694, "grad_norm": 1.5253174304962158, "learning_rate": 5e-05, "loss": 1.948, "step": 534000 }, { "epoch": 0.5539741692717259, "grad_norm": 1.4124022722244263, "learning_rate": 5e-05, "loss": 1.9532, "step": 535000 }, { "epoch": 0.5539741692717259, "eval_loss": 1.9418219327926636, "eval_runtime": 20.3658, "eval_samples_per_second": 2428.43, "eval_steps_per_second": 9.526, "step": 535000 }, { "epoch": 0.5550096350086824, "grad_norm": 1.3456830978393555, "learning_rate": 5e-05, "loss": 1.9475, "step": 536000 }, { "epoch": 0.5560451007456388, "grad_norm": 2.3639848232269287, "learning_rate": 5e-05, "loss": 1.9477, "step": 537000 }, { "epoch": 0.5570805664825954, "grad_norm": 1.6593014001846313, "learning_rate": 5e-05, "loss": 1.9352, "step": 538000 }, { "epoch": 0.5581160322195519, "grad_norm": 1.6242729425430298, "learning_rate": 5e-05, "loss": 1.9353, "step": 539000 }, { "epoch": 0.5591514979565083, "grad_norm": 1.4847745895385742, "learning_rate": 5e-05, "loss": 1.9462, "step": 540000 }, { "epoch": 0.5591514979565083, "eval_loss": 1.9415589570999146, "eval_runtime": 20.4338, "eval_samples_per_second": 2420.357, "eval_steps_per_second": 9.494, "step": 540000 }, { "epoch": 0.5601869636934649, "grad_norm": 1.6553374528884888, "learning_rate": 5e-05, "loss": 1.9401, "step": 541000 }, { "epoch": 0.5612224294304213, "grad_norm": 1.6404752731323242, "learning_rate": 5e-05, "loss": 1.9523, "step": 542000 }, { "epoch": 0.5622578951673779, "grad_norm": 1.7159552574157715, "learning_rate": 5e-05, "loss": 1.9538, "step": 543000 }, { "epoch": 0.5632933609043344, "grad_norm": 1.8530490398406982, "learning_rate": 5e-05, "loss": 1.9471, "step": 544000 }, { "epoch": 0.5643288266412908, "grad_norm": 1.7620545625686646, "learning_rate": 5e-05, "loss": 1.9563, "step": 545000 }, { "epoch": 0.5643288266412908, "eval_loss": 1.9346576929092407, "eval_runtime": 21.3714, "eval_samples_per_second": 2314.163, "eval_steps_per_second": 9.078, "step": 545000 }, { "epoch": 0.5653642923782474, "grad_norm": 1.5687700510025024, "learning_rate": 5e-05, "loss": 1.9464, "step": 546000 }, { "epoch": 0.5663997581152038, "grad_norm": 1.4959015846252441, "learning_rate": 5e-05, "loss": 1.9476, "step": 547000 }, { "epoch": 0.5674352238521604, "grad_norm": 1.7213226556777954, "learning_rate": 5e-05, "loss": 1.9385, "step": 548000 }, { "epoch": 0.5684706895891168, "grad_norm": 1.9467849731445312, "learning_rate": 5e-05, "loss": 1.9537, "step": 549000 }, { "epoch": 0.5695061553260733, "grad_norm": 1.5239862203598022, "learning_rate": 5e-05, "loss": 1.953, "step": 550000 }, { "epoch": 0.5695061553260733, "eval_loss": 1.9318312406539917, "eval_runtime": 20.9119, "eval_samples_per_second": 2365.015, "eval_steps_per_second": 9.277, "step": 550000 }, { "epoch": 0.5705416210630299, "grad_norm": 1.425758957862854, "learning_rate": 5e-05, "loss": 1.936, "step": 551000 }, { "epoch": 0.5715770867999863, "grad_norm": 1.4828753471374512, "learning_rate": 5e-05, "loss": 1.9395, "step": 552000 }, { "epoch": 0.5726125525369429, "grad_norm": 1.7051657438278198, "learning_rate": 5e-05, "loss": 1.9559, "step": 553000 }, { "epoch": 0.5736480182738993, "grad_norm": 1.7012929916381836, "learning_rate": 5e-05, "loss": 1.9345, "step": 554000 }, { "epoch": 0.5746834840108558, "grad_norm": 1.9521725177764893, "learning_rate": 5e-05, "loss": 1.9387, "step": 555000 }, { "epoch": 0.5746834840108558, "eval_loss": 1.9329185485839844, "eval_runtime": 22.9876, "eval_samples_per_second": 2151.465, "eval_steps_per_second": 8.439, "step": 555000 }, { "epoch": 0.5757189497478123, "grad_norm": 1.610854148864746, "learning_rate": 5e-05, "loss": 1.9437, "step": 556000 }, { "epoch": 0.5767544154847688, "grad_norm": 1.6509226560592651, "learning_rate": 5e-05, "loss": 1.9313, "step": 557000 }, { "epoch": 0.5777898812217254, "grad_norm": 1.6432125568389893, "learning_rate": 5e-05, "loss": 1.9341, "step": 558000 }, { "epoch": 0.5788253469586818, "grad_norm": 1.82942533493042, "learning_rate": 5e-05, "loss": 1.9427, "step": 559000 }, { "epoch": 0.5798608126956383, "grad_norm": 1.574141263961792, "learning_rate": 5e-05, "loss": 1.9342, "step": 560000 }, { "epoch": 0.5798608126956383, "eval_loss": 1.9234023094177246, "eval_runtime": 20.6847, "eval_samples_per_second": 2390.993, "eval_steps_per_second": 9.379, "step": 560000 }, { "epoch": 0.5808962784325948, "grad_norm": 1.1034334897994995, "learning_rate": 5e-05, "loss": 1.9385, "step": 561000 }, { "epoch": 0.5819317441695513, "grad_norm": 1.800497055053711, "learning_rate": 5e-05, "loss": 1.945, "step": 562000 }, { "epoch": 0.5829672099065077, "grad_norm": 1.6229628324508667, "learning_rate": 5e-05, "loss": 1.9357, "step": 563000 }, { "epoch": 0.5840026756434643, "grad_norm": 1.500807762145996, "learning_rate": 5e-05, "loss": 1.9461, "step": 564000 }, { "epoch": 0.5850381413804208, "grad_norm": 1.3186248540878296, "learning_rate": 5e-05, "loss": 1.9359, "step": 565000 }, { "epoch": 0.5850381413804208, "eval_loss": 1.9301375150680542, "eval_runtime": 21.6317, "eval_samples_per_second": 2286.315, "eval_steps_per_second": 8.968, "step": 565000 }, { "epoch": 0.5860736071173773, "grad_norm": 1.6370793581008911, "learning_rate": 5e-05, "loss": 1.9336, "step": 566000 }, { "epoch": 0.5871090728543338, "grad_norm": 1.455642819404602, "learning_rate": 5e-05, "loss": 1.935, "step": 567000 }, { "epoch": 0.5881445385912902, "grad_norm": 1.7208240032196045, "learning_rate": 5e-05, "loss": 1.9391, "step": 568000 }, { "epoch": 0.5891800043282468, "grad_norm": 1.6520531177520752, "learning_rate": 5e-05, "loss": 1.9313, "step": 569000 }, { "epoch": 0.5902154700652033, "grad_norm": 1.641925573348999, "learning_rate": 5e-05, "loss": 1.941, "step": 570000 }, { "epoch": 0.5902154700652033, "eval_loss": 1.924635887145996, "eval_runtime": 20.8475, "eval_samples_per_second": 2372.317, "eval_steps_per_second": 9.306, "step": 570000 }, { "epoch": 0.5912509358021598, "grad_norm": 1.756401538848877, "learning_rate": 5e-05, "loss": 1.9198, "step": 571000 }, { "epoch": 0.5922864015391163, "grad_norm": 1.855057716369629, "learning_rate": 5e-05, "loss": 1.9265, "step": 572000 }, { "epoch": 0.5933218672760727, "grad_norm": 1.4401320219039917, "learning_rate": 5e-05, "loss": 1.9253, "step": 573000 }, { "epoch": 0.5943573330130293, "grad_norm": 1.8701120615005493, "learning_rate": 5e-05, "loss": 1.9371, "step": 574000 }, { "epoch": 0.5953927987499857, "grad_norm": 1.3622015714645386, "learning_rate": 5e-05, "loss": 1.9255, "step": 575000 }, { "epoch": 0.5953927987499857, "eval_loss": 1.9278358221054077, "eval_runtime": 20.619, "eval_samples_per_second": 2398.61, "eval_steps_per_second": 9.409, "step": 575000 }, { "epoch": 0.5964282644869423, "grad_norm": 1.423957109451294, "learning_rate": 5e-05, "loss": 1.9304, "step": 576000 }, { "epoch": 0.5974637302238988, "grad_norm": 1.4679838418960571, "learning_rate": 5e-05, "loss": 1.9356, "step": 577000 }, { "epoch": 0.5984991959608552, "grad_norm": 1.7475810050964355, "learning_rate": 5e-05, "loss": 1.9383, "step": 578000 }, { "epoch": 0.5995346616978118, "grad_norm": 1.4328486919403076, "learning_rate": 5e-05, "loss": 1.9211, "step": 579000 }, { "epoch": 0.6005701274347682, "grad_norm": 1.367355465888977, "learning_rate": 5e-05, "loss": 1.928, "step": 580000 }, { "epoch": 0.6005701274347682, "eval_loss": 1.926809310913086, "eval_runtime": 21.2573, "eval_samples_per_second": 2326.589, "eval_steps_per_second": 9.126, "step": 580000 }, { "epoch": 0.6016055931717248, "grad_norm": 1.7061880826950073, "learning_rate": 5e-05, "loss": 1.927, "step": 581000 }, { "epoch": 0.6026410589086812, "grad_norm": 1.2841322422027588, "learning_rate": 5e-05, "loss": 1.9244, "step": 582000 }, { "epoch": 0.6036765246456377, "grad_norm": 1.597072958946228, "learning_rate": 5e-05, "loss": 1.9297, "step": 583000 }, { "epoch": 0.6047119903825943, "grad_norm": 1.5782889127731323, "learning_rate": 5e-05, "loss": 1.9293, "step": 584000 }, { "epoch": 0.6057474561195507, "grad_norm": 1.709469199180603, "learning_rate": 5e-05, "loss": 1.9275, "step": 585000 }, { "epoch": 0.6057474561195507, "eval_loss": 1.9254355430603027, "eval_runtime": 20.5016, "eval_samples_per_second": 2412.346, "eval_steps_per_second": 9.463, "step": 585000 }, { "epoch": 0.6067829218565072, "grad_norm": 1.491125226020813, "learning_rate": 5e-05, "loss": 1.9303, "step": 586000 }, { "epoch": 0.6078183875934637, "grad_norm": 1.8089282512664795, "learning_rate": 5e-05, "loss": 1.9119, "step": 587000 }, { "epoch": 0.6088538533304202, "grad_norm": 1.6499803066253662, "learning_rate": 5e-05, "loss": 1.9297, "step": 588000 }, { "epoch": 0.6098893190673768, "grad_norm": 1.563252568244934, "learning_rate": 5e-05, "loss": 1.9225, "step": 589000 }, { "epoch": 0.6109247848043332, "grad_norm": 1.752363681793213, "learning_rate": 5e-05, "loss": 1.9248, "step": 590000 }, { "epoch": 0.6109247848043332, "eval_loss": 1.9221693277359009, "eval_runtime": 21.0482, "eval_samples_per_second": 2349.701, "eval_steps_per_second": 9.217, "step": 590000 }, { "epoch": 0.6119602505412897, "grad_norm": 1.770412802696228, "learning_rate": 5e-05, "loss": 1.9174, "step": 591000 }, { "epoch": 0.6129957162782462, "grad_norm": 1.7694320678710938, "learning_rate": 5e-05, "loss": 1.932, "step": 592000 }, { "epoch": 0.6140311820152027, "grad_norm": 1.4580289125442505, "learning_rate": 5e-05, "loss": 1.935, "step": 593000 }, { "epoch": 0.6150666477521592, "grad_norm": 1.39983332157135, "learning_rate": 5e-05, "loss": 1.9131, "step": 594000 }, { "epoch": 0.6161021134891157, "grad_norm": 1.8075814247131348, "learning_rate": 5e-05, "loss": 1.9265, "step": 595000 }, { "epoch": 0.6161021134891157, "eval_loss": 1.9219050407409668, "eval_runtime": 20.8447, "eval_samples_per_second": 2372.645, "eval_steps_per_second": 9.307, "step": 595000 }, { "epoch": 0.6171375792260722, "grad_norm": 1.6267366409301758, "learning_rate": 5e-05, "loss": 1.9316, "step": 596000 }, { "epoch": 0.6181730449630287, "grad_norm": 1.6033700704574585, "learning_rate": 5e-05, "loss": 1.92, "step": 597000 }, { "epoch": 0.6192085106999852, "grad_norm": 1.993261456489563, "learning_rate": 5e-05, "loss": 1.9246, "step": 598000 }, { "epoch": 0.6202439764369417, "grad_norm": 1.785793423652649, "learning_rate": 5e-05, "loss": 1.9264, "step": 599000 }, { "epoch": 0.6212794421738982, "grad_norm": 1.7126010656356812, "learning_rate": 5e-05, "loss": 1.9228, "step": 600000 }, { "epoch": 0.6212794421738982, "eval_loss": 1.910009741783142, "eval_runtime": 21.3491, "eval_samples_per_second": 2316.58, "eval_steps_per_second": 9.087, "step": 600000 }, { "epoch": 0.6223149079108546, "grad_norm": 1.6621829271316528, "learning_rate": 5e-05, "loss": 1.9381, "step": 601000 }, { "epoch": 0.6233503736478112, "grad_norm": 2.010495185852051, "learning_rate": 5e-05, "loss": 1.9177, "step": 602000 }, { "epoch": 0.6243858393847677, "grad_norm": 1.4604156017303467, "learning_rate": 5e-05, "loss": 1.9163, "step": 603000 }, { "epoch": 0.6254213051217242, "grad_norm": 1.5160646438598633, "learning_rate": 5e-05, "loss": 1.9309, "step": 604000 }, { "epoch": 0.6264567708586807, "grad_norm": 1.889029860496521, "learning_rate": 5e-05, "loss": 1.9245, "step": 605000 }, { "epoch": 0.6264567708586807, "eval_loss": 1.9162534475326538, "eval_runtime": 20.621, "eval_samples_per_second": 2398.382, "eval_steps_per_second": 9.408, "step": 605000 }, { "epoch": 0.6274922365956371, "grad_norm": 1.5529805421829224, "learning_rate": 5e-05, "loss": 1.9274, "step": 606000 }, { "epoch": 0.6285277023325937, "grad_norm": 1.7846838235855103, "learning_rate": 5e-05, "loss": 1.9304, "step": 607000 }, { "epoch": 0.6295631680695501, "grad_norm": 1.9997135400772095, "learning_rate": 5e-05, "loss": 1.9235, "step": 608000 }, { "epoch": 0.6305986338065067, "grad_norm": 1.391270637512207, "learning_rate": 5e-05, "loss": 1.9193, "step": 609000 }, { "epoch": 0.6316340995434632, "grad_norm": 1.8219945430755615, "learning_rate": 5e-05, "loss": 1.9121, "step": 610000 }, { "epoch": 0.6316340995434632, "eval_loss": 1.9175958633422852, "eval_runtime": 21.497, "eval_samples_per_second": 2300.643, "eval_steps_per_second": 9.025, "step": 610000 }, { "epoch": 0.6326695652804196, "grad_norm": 1.5030641555786133, "learning_rate": 5e-05, "loss": 1.9238, "step": 611000 }, { "epoch": 0.6337050310173762, "grad_norm": 1.6022168397903442, "learning_rate": 5e-05, "loss": 1.9356, "step": 612000 }, { "epoch": 0.6347404967543326, "grad_norm": 1.4753010272979736, "learning_rate": 5e-05, "loss": 1.9258, "step": 613000 }, { "epoch": 0.6357759624912891, "grad_norm": 1.8194416761398315, "learning_rate": 5e-05, "loss": 1.9158, "step": 614000 }, { "epoch": 0.6368114282282457, "grad_norm": 1.3283801078796387, "learning_rate": 5e-05, "loss": 1.909, "step": 615000 }, { "epoch": 0.6368114282282457, "eval_loss": 1.9115009307861328, "eval_runtime": 20.6331, "eval_samples_per_second": 2396.974, "eval_steps_per_second": 9.402, "step": 615000 }, { "epoch": 0.6378468939652021, "grad_norm": 1.2602663040161133, "learning_rate": 5e-05, "loss": 1.9231, "step": 616000 }, { "epoch": 0.6388823597021587, "grad_norm": 1.8703596591949463, "learning_rate": 5e-05, "loss": 1.9159, "step": 617000 }, { "epoch": 0.6399178254391151, "grad_norm": 1.7976280450820923, "learning_rate": 5e-05, "loss": 1.9194, "step": 618000 }, { "epoch": 0.6409532911760716, "grad_norm": 1.4662623405456543, "learning_rate": 5e-05, "loss": 1.912, "step": 619000 }, { "epoch": 0.6419887569130281, "grad_norm": 1.8969612121582031, "learning_rate": 5e-05, "loss": 1.9232, "step": 620000 }, { "epoch": 0.6419887569130281, "eval_loss": 1.9057178497314453, "eval_runtime": 21.3901, "eval_samples_per_second": 2312.146, "eval_steps_per_second": 9.07, "step": 620000 }, { "epoch": 0.6430242226499846, "grad_norm": 1.6290833950042725, "learning_rate": 5e-05, "loss": 1.9103, "step": 621000 }, { "epoch": 0.6440596883869412, "grad_norm": 1.7869369983673096, "learning_rate": 5e-05, "loss": 1.9181, "step": 622000 }, { "epoch": 0.6450951541238976, "grad_norm": 1.555368185043335, "learning_rate": 5e-05, "loss": 1.9203, "step": 623000 }, { "epoch": 0.6461306198608541, "grad_norm": 1.8794424533843994, "learning_rate": 5e-05, "loss": 1.918, "step": 624000 }, { "epoch": 0.6471660855978106, "grad_norm": 1.838217854499817, "learning_rate": 5e-05, "loss": 1.9268, "step": 625000 }, { "epoch": 0.6471660855978106, "eval_loss": 1.9147250652313232, "eval_runtime": 20.7607, "eval_samples_per_second": 2382.241, "eval_steps_per_second": 9.345, "step": 625000 }, { "epoch": 0.6482015513347671, "grad_norm": 1.9131754636764526, "learning_rate": 5e-05, "loss": 1.913, "step": 626000 }, { "epoch": 0.6492370170717237, "grad_norm": 1.3449556827545166, "learning_rate": 5e-05, "loss": 1.9134, "step": 627000 }, { "epoch": 0.6502724828086801, "grad_norm": 1.7116332054138184, "learning_rate": 5e-05, "loss": 1.92, "step": 628000 }, { "epoch": 0.6513079485456366, "grad_norm": 1.9661062955856323, "learning_rate": 5e-05, "loss": 1.9181, "step": 629000 }, { "epoch": 0.6523434142825931, "grad_norm": 2.0663058757781982, "learning_rate": 5e-05, "loss": 1.9209, "step": 630000 }, { "epoch": 0.6523434142825931, "eval_loss": 1.9034740924835205, "eval_runtime": 21.31, "eval_samples_per_second": 2320.84, "eval_steps_per_second": 9.104, "step": 630000 }, { "epoch": 0.6533788800195496, "grad_norm": 1.7314753532409668, "learning_rate": 5e-05, "loss": 1.9258, "step": 631000 }, { "epoch": 0.6544143457565061, "grad_norm": 1.4399973154067993, "learning_rate": 5e-05, "loss": 1.9185, "step": 632000 }, { "epoch": 0.6554498114934626, "grad_norm": 1.7365604639053345, "learning_rate": 5e-05, "loss": 1.9213, "step": 633000 }, { "epoch": 0.656485277230419, "grad_norm": 1.6954264640808105, "learning_rate": 5e-05, "loss": 1.9203, "step": 634000 }, { "epoch": 0.6575207429673756, "grad_norm": 1.5337830781936646, "learning_rate": 5e-05, "loss": 1.9206, "step": 635000 }, { "epoch": 0.6575207429673756, "eval_loss": 1.8993173837661743, "eval_runtime": 20.4063, "eval_samples_per_second": 2423.615, "eval_steps_per_second": 9.507, "step": 635000 }, { "epoch": 0.6585562087043321, "grad_norm": 1.5964773893356323, "learning_rate": 5e-05, "loss": 1.9141, "step": 636000 }, { "epoch": 0.6595916744412885, "grad_norm": 1.443393588066101, "learning_rate": 5e-05, "loss": 1.9231, "step": 637000 }, { "epoch": 0.6606271401782451, "grad_norm": 1.5183662176132202, "learning_rate": 5e-05, "loss": 1.9169, "step": 638000 }, { "epoch": 0.6616626059152015, "grad_norm": 1.3888758420944214, "learning_rate": 5e-05, "loss": 1.9232, "step": 639000 }, { "epoch": 0.6626980716521581, "grad_norm": 2.3498077392578125, "learning_rate": 5e-05, "loss": 1.9112, "step": 640000 }, { "epoch": 0.6626980716521581, "eval_loss": 1.9156203269958496, "eval_runtime": 21.2557, "eval_samples_per_second": 2326.767, "eval_steps_per_second": 9.127, "step": 640000 }, { "epoch": 0.6637335373891146, "grad_norm": 1.3806946277618408, "learning_rate": 5e-05, "loss": 1.9239, "step": 641000 }, { "epoch": 0.664769003126071, "grad_norm": 1.379116177558899, "learning_rate": 5e-05, "loss": 1.9193, "step": 642000 }, { "epoch": 0.6658044688630276, "grad_norm": 1.5152908563613892, "learning_rate": 5e-05, "loss": 1.9157, "step": 643000 }, { "epoch": 0.666839934599984, "grad_norm": 1.9281625747680664, "learning_rate": 5e-05, "loss": 1.9223, "step": 644000 }, { "epoch": 0.6678754003369406, "grad_norm": 1.6093673706054688, "learning_rate": 5e-05, "loss": 1.9197, "step": 645000 }, { "epoch": 0.6678754003369406, "eval_loss": 1.9053946733474731, "eval_runtime": 20.5485, "eval_samples_per_second": 2406.846, "eval_steps_per_second": 9.441, "step": 645000 }, { "epoch": 0.668910866073897, "grad_norm": 1.7991442680358887, "learning_rate": 5e-05, "loss": 1.9208, "step": 646000 }, { "epoch": 0.6699463318108535, "grad_norm": 1.7188022136688232, "learning_rate": 5e-05, "loss": 1.9274, "step": 647000 }, { "epoch": 0.6709817975478101, "grad_norm": 1.4109748601913452, "learning_rate": 5e-05, "loss": 1.9107, "step": 648000 }, { "epoch": 0.6720172632847665, "grad_norm": 1.8776260614395142, "learning_rate": 5e-05, "loss": 1.9152, "step": 649000 }, { "epoch": 0.6730527290217231, "grad_norm": 1.761953592300415, "learning_rate": 5e-05, "loss": 1.9088, "step": 650000 }, { "epoch": 0.6730527290217231, "eval_loss": 1.902655005455017, "eval_runtime": 21.2187, "eval_samples_per_second": 2330.819, "eval_steps_per_second": 9.143, "step": 650000 }, { "epoch": 0.6740881947586795, "grad_norm": 1.4429457187652588, "learning_rate": 5e-05, "loss": 1.913, "step": 651000 }, { "epoch": 0.675123660495636, "grad_norm": 1.3650360107421875, "learning_rate": 5e-05, "loss": 1.9099, "step": 652000 }, { "epoch": 0.6761591262325926, "grad_norm": 1.3658324480056763, "learning_rate": 5e-05, "loss": 1.9249, "step": 653000 }, { "epoch": 0.677194591969549, "grad_norm": 2.063399076461792, "learning_rate": 5e-05, "loss": 1.9176, "step": 654000 }, { "epoch": 0.6782300577065056, "grad_norm": 1.9073212146759033, "learning_rate": 5e-05, "loss": 1.9055, "step": 655000 }, { "epoch": 0.6782300577065056, "eval_loss": 1.9057023525238037, "eval_runtime": 20.7336, "eval_samples_per_second": 2385.35, "eval_steps_per_second": 9.357, "step": 655000 }, { "epoch": 0.679265523443462, "grad_norm": 1.8441895246505737, "learning_rate": 5e-05, "loss": 1.9204, "step": 656000 }, { "epoch": 0.6803009891804185, "grad_norm": 1.539784550666809, "learning_rate": 5e-05, "loss": 1.9236, "step": 657000 }, { "epoch": 0.681336454917375, "grad_norm": 1.7579704523086548, "learning_rate": 5e-05, "loss": 1.8967, "step": 658000 }, { "epoch": 0.6823719206543315, "grad_norm": 1.5053812265396118, "learning_rate": 5e-05, "loss": 1.9077, "step": 659000 }, { "epoch": 0.683407386391288, "grad_norm": 1.3976589441299438, "learning_rate": 5e-05, "loss": 1.9107, "step": 660000 }, { "epoch": 0.683407386391288, "eval_loss": 1.9005506038665771, "eval_runtime": 21.2622, "eval_samples_per_second": 2326.058, "eval_steps_per_second": 9.124, "step": 660000 }, { "epoch": 0.6844428521282445, "grad_norm": 1.442474603652954, "learning_rate": 5e-05, "loss": 1.8983, "step": 661000 }, { "epoch": 0.685478317865201, "grad_norm": 1.574547290802002, "learning_rate": 5e-05, "loss": 1.9212, "step": 662000 }, { "epoch": 0.6865137836021575, "grad_norm": 1.293898105621338, "learning_rate": 5e-05, "loss": 1.9139, "step": 663000 }, { "epoch": 0.687549249339114, "grad_norm": 1.7832250595092773, "learning_rate": 5e-05, "loss": 1.9123, "step": 664000 }, { "epoch": 0.6885847150760704, "grad_norm": 1.4399775266647339, "learning_rate": 5e-05, "loss": 1.9073, "step": 665000 }, { "epoch": 0.6885847150760704, "eval_loss": 1.895343542098999, "eval_runtime": 20.1859, "eval_samples_per_second": 2450.071, "eval_steps_per_second": 9.611, "step": 665000 }, { "epoch": 0.689620180813027, "grad_norm": 1.6234581470489502, "learning_rate": 5e-05, "loss": 1.9009, "step": 666000 }, { "epoch": 0.6906556465499835, "grad_norm": 1.2184017896652222, "learning_rate": 5e-05, "loss": 1.9023, "step": 667000 }, { "epoch": 0.69169111228694, "grad_norm": 1.973521113395691, "learning_rate": 5e-05, "loss": 1.8883, "step": 668000 }, { "epoch": 0.6927265780238965, "grad_norm": 1.455173373222351, "learning_rate": 5e-05, "loss": 1.9057, "step": 669000 }, { "epoch": 0.6937620437608529, "grad_norm": 1.7372217178344727, "learning_rate": 5e-05, "loss": 1.9045, "step": 670000 }, { "epoch": 0.6937620437608529, "eval_loss": 1.8927185535430908, "eval_runtime": 21.4642, "eval_samples_per_second": 2304.16, "eval_steps_per_second": 9.038, "step": 670000 }, { "epoch": 0.6947975094978095, "grad_norm": 1.8958580493927002, "learning_rate": 5e-05, "loss": 1.897, "step": 671000 }, { "epoch": 0.695832975234766, "grad_norm": 1.4989745616912842, "learning_rate": 5e-05, "loss": 1.9102, "step": 672000 }, { "epoch": 0.6968684409717225, "grad_norm": 1.7313189506530762, "learning_rate": 5e-05, "loss": 1.9097, "step": 673000 }, { "epoch": 0.697903906708679, "grad_norm": 1.197485327720642, "learning_rate": 5e-05, "loss": 1.8999, "step": 674000 }, { "epoch": 0.6989393724456354, "grad_norm": 1.484868049621582, "learning_rate": 5e-05, "loss": 1.9096, "step": 675000 }, { "epoch": 0.6989393724456354, "eval_loss": 1.897947907447815, "eval_runtime": 21.0415, "eval_samples_per_second": 2350.453, "eval_steps_per_second": 9.22, "step": 675000 }, { "epoch": 0.699974838182592, "grad_norm": 1.7760064601898193, "learning_rate": 5e-05, "loss": 1.9048, "step": 676000 }, { "epoch": 0.7010103039195484, "grad_norm": 2.166504383087158, "learning_rate": 5e-05, "loss": 1.9077, "step": 677000 }, { "epoch": 0.702045769656505, "grad_norm": 1.9204415082931519, "learning_rate": 5e-05, "loss": 1.9092, "step": 678000 }, { "epoch": 0.7030812353934615, "grad_norm": 1.7995051145553589, "learning_rate": 5e-05, "loss": 1.912, "step": 679000 }, { "epoch": 0.7041167011304179, "grad_norm": 1.947636604309082, "learning_rate": 5e-05, "loss": 1.9136, "step": 680000 }, { "epoch": 0.7041167011304179, "eval_loss": 1.896292805671692, "eval_runtime": 20.6404, "eval_samples_per_second": 2396.129, "eval_steps_per_second": 9.399, "step": 680000 }, { "epoch": 0.7051521668673745, "grad_norm": 1.4779621362686157, "learning_rate": 5e-05, "loss": 1.9079, "step": 681000 }, { "epoch": 0.7061876326043309, "grad_norm": 1.6015859842300415, "learning_rate": 5e-05, "loss": 1.9054, "step": 682000 }, { "epoch": 0.7072230983412874, "grad_norm": 1.571121335029602, "learning_rate": 5e-05, "loss": 1.9174, "step": 683000 }, { "epoch": 0.708258564078244, "grad_norm": 1.3641341924667358, "learning_rate": 5e-05, "loss": 1.9003, "step": 684000 }, { "epoch": 0.7092940298152004, "grad_norm": 1.9284065961837769, "learning_rate": 5e-05, "loss": 1.9146, "step": 685000 }, { "epoch": 0.7092940298152004, "eval_loss": 1.892472743988037, "eval_runtime": 23.451, "eval_samples_per_second": 2108.955, "eval_steps_per_second": 8.273, "step": 685000 }, { "epoch": 0.710329495552157, "grad_norm": 1.8129565715789795, "learning_rate": 5e-05, "loss": 1.912, "step": 686000 }, { "epoch": 0.7113649612891134, "grad_norm": 1.6975070238113403, "learning_rate": 5e-05, "loss": 1.8925, "step": 687000 }, { "epoch": 0.7124004270260699, "grad_norm": 1.4034419059753418, "learning_rate": 5e-05, "loss": 1.9091, "step": 688000 }, { "epoch": 0.7134358927630264, "grad_norm": 2.147268056869507, "learning_rate": 5e-05, "loss": 1.9041, "step": 689000 }, { "epoch": 0.7144713584999829, "grad_norm": 2.014146327972412, "learning_rate": 5e-05, "loss": 1.8952, "step": 690000 }, { "epoch": 0.7144713584999829, "eval_loss": 1.9015657901763916, "eval_runtime": 18.7155, "eval_samples_per_second": 2642.575, "eval_steps_per_second": 10.366, "step": 690000 }, { "epoch": 0.7155068242369395, "grad_norm": 1.773491621017456, "learning_rate": 5e-05, "loss": 1.9114, "step": 691000 }, { "epoch": 0.7165422899738959, "grad_norm": 1.6221060752868652, "learning_rate": 5e-05, "loss": 1.9072, "step": 692000 }, { "epoch": 0.7175777557108524, "grad_norm": 2.0108203887939453, "learning_rate": 5e-05, "loss": 1.8966, "step": 693000 }, { "epoch": 0.7186132214478089, "grad_norm": 1.7623751163482666, "learning_rate": 5e-05, "loss": 1.8931, "step": 694000 }, { "epoch": 0.7196486871847654, "grad_norm": 1.6833388805389404, "learning_rate": 5e-05, "loss": 1.9088, "step": 695000 }, { "epoch": 0.7196486871847654, "eval_loss": 1.8960736989974976, "eval_runtime": 20.0395, "eval_samples_per_second": 2467.97, "eval_steps_per_second": 9.681, "step": 695000 }, { "epoch": 0.7206841529217219, "grad_norm": 1.7570165395736694, "learning_rate": 5e-05, "loss": 1.8961, "step": 696000 }, { "epoch": 0.7217196186586784, "grad_norm": 1.8349229097366333, "learning_rate": 5e-05, "loss": 1.9142, "step": 697000 }, { "epoch": 0.7227550843956349, "grad_norm": 1.9120980501174927, "learning_rate": 5e-05, "loss": 1.9123, "step": 698000 }, { "epoch": 0.7237905501325914, "grad_norm": 1.4346221685409546, "learning_rate": 5e-05, "loss": 1.9, "step": 699000 }, { "epoch": 0.7248260158695479, "grad_norm": 1.5059664249420166, "learning_rate": 5e-05, "loss": 1.9088, "step": 700000 }, { "epoch": 0.7248260158695479, "eval_loss": 1.8930763006210327, "eval_runtime": 20.3037, "eval_samples_per_second": 2435.865, "eval_steps_per_second": 9.555, "step": 700000 }, { "epoch": 0.7258614816065044, "grad_norm": 1.4954737424850464, "learning_rate": 5e-05, "loss": 1.8894, "step": 701000 }, { "epoch": 0.7268969473434609, "grad_norm": 2.206475257873535, "learning_rate": 5e-05, "loss": 1.897, "step": 702000 }, { "epoch": 0.7279324130804173, "grad_norm": 1.39924156665802, "learning_rate": 5e-05, "loss": 1.9081, "step": 703000 }, { "epoch": 0.7289678788173739, "grad_norm": 1.6110947132110596, "learning_rate": 5e-05, "loss": 1.8973, "step": 704000 }, { "epoch": 0.7300033445543304, "grad_norm": 1.7667462825775146, "learning_rate": 5e-05, "loss": 1.893, "step": 705000 }, { "epoch": 0.7300033445543304, "eval_loss": 1.8968194723129272, "eval_runtime": 21.1571, "eval_samples_per_second": 2337.611, "eval_steps_per_second": 9.17, "step": 705000 }, { "epoch": 0.7310388102912868, "grad_norm": 1.4846593141555786, "learning_rate": 5e-05, "loss": 1.8858, "step": 706000 }, { "epoch": 0.7320742760282434, "grad_norm": 1.5730334520339966, "learning_rate": 5e-05, "loss": 1.902, "step": 707000 }, { "epoch": 0.7331097417651998, "grad_norm": 2.5404248237609863, "learning_rate": 5e-05, "loss": 1.8896, "step": 708000 }, { "epoch": 0.7341452075021564, "grad_norm": 2.4605815410614014, "learning_rate": 5e-05, "loss": 1.9059, "step": 709000 }, { "epoch": 0.7351806732391128, "grad_norm": 1.3232377767562866, "learning_rate": 5e-05, "loss": 1.9015, "step": 710000 }, { "epoch": 0.7351806732391128, "eval_loss": 1.8934879302978516, "eval_runtime": 21.4904, "eval_samples_per_second": 2301.348, "eval_steps_per_second": 9.027, "step": 710000 }, { "epoch": 0.7362161389760693, "grad_norm": 1.9923709630966187, "learning_rate": 5e-05, "loss": 1.8914, "step": 711000 }, { "epoch": 0.7372516047130259, "grad_norm": 1.58742094039917, "learning_rate": 5e-05, "loss": 1.913, "step": 712000 }, { "epoch": 0.7382870704499823, "grad_norm": 1.5676469802856445, "learning_rate": 5e-05, "loss": 1.8864, "step": 713000 }, { "epoch": 0.7393225361869389, "grad_norm": 1.5205984115600586, "learning_rate": 5e-05, "loss": 1.8859, "step": 714000 }, { "epoch": 0.7403580019238953, "grad_norm": 1.9802634716033936, "learning_rate": 5e-05, "loss": 1.8972, "step": 715000 }, { "epoch": 0.7403580019238953, "eval_loss": 1.8818447589874268, "eval_runtime": 32.1303, "eval_samples_per_second": 1539.266, "eval_steps_per_second": 6.038, "step": 715000 }, { "epoch": 0.7413934676608518, "grad_norm": 1.7090638875961304, "learning_rate": 5e-05, "loss": 1.8931, "step": 716000 }, { "epoch": 0.7424289333978084, "grad_norm": 1.3374643325805664, "learning_rate": 5e-05, "loss": 1.9076, "step": 717000 }, { "epoch": 0.7434643991347648, "grad_norm": 1.7601673603057861, "learning_rate": 5e-05, "loss": 1.9031, "step": 718000 }, { "epoch": 0.7444998648717214, "grad_norm": 1.3686988353729248, "learning_rate": 5e-05, "loss": 1.8969, "step": 719000 }, { "epoch": 0.7455353306086778, "grad_norm": 1.6470602750778198, "learning_rate": 5e-05, "loss": 1.8986, "step": 720000 }, { "epoch": 0.7455353306086778, "eval_loss": 1.8832844495773315, "eval_runtime": 20.1206, "eval_samples_per_second": 2458.028, "eval_steps_per_second": 9.642, "step": 720000 }, { "epoch": 0.7465707963456343, "grad_norm": 1.6907163858413696, "learning_rate": 5e-05, "loss": 1.8998, "step": 721000 }, { "epoch": 0.7476062620825908, "grad_norm": 1.5861003398895264, "learning_rate": 5e-05, "loss": 1.9007, "step": 722000 }, { "epoch": 0.7486417278195473, "grad_norm": 1.5830234289169312, "learning_rate": 5e-05, "loss": 1.8931, "step": 723000 }, { "epoch": 0.7496771935565039, "grad_norm": 1.778434157371521, "learning_rate": 5e-05, "loss": 1.9, "step": 724000 }, { "epoch": 0.7507126592934603, "grad_norm": 1.8183609247207642, "learning_rate": 5e-05, "loss": 1.8913, "step": 725000 }, { "epoch": 0.7507126592934603, "eval_loss": 1.8839964866638184, "eval_runtime": 24.8386, "eval_samples_per_second": 1991.138, "eval_steps_per_second": 7.81, "step": 725000 }, { "epoch": 0.7517481250304168, "grad_norm": 1.8275715112686157, "learning_rate": 5e-05, "loss": 1.8965, "step": 726000 }, { "epoch": 0.7527835907673733, "grad_norm": 1.6312036514282227, "learning_rate": 5e-05, "loss": 1.8781, "step": 727000 }, { "epoch": 0.7538190565043298, "grad_norm": 1.813920259475708, "learning_rate": 5e-05, "loss": 1.8957, "step": 728000 }, { "epoch": 0.7548545222412864, "grad_norm": 1.5408565998077393, "learning_rate": 5e-05, "loss": 1.8962, "step": 729000 }, { "epoch": 0.7558899879782428, "grad_norm": 1.43170964717865, "learning_rate": 5e-05, "loss": 1.9008, "step": 730000 }, { "epoch": 0.7558899879782428, "eval_loss": 1.8895530700683594, "eval_runtime": 23.6536, "eval_samples_per_second": 2090.887, "eval_steps_per_second": 8.202, "step": 730000 }, { "epoch": 0.7569254537151993, "grad_norm": 1.4100663661956787, "learning_rate": 5e-05, "loss": 1.8896, "step": 731000 }, { "epoch": 0.7579609194521558, "grad_norm": 1.88717782497406, "learning_rate": 5e-05, "loss": 1.897, "step": 732000 }, { "epoch": 0.7589963851891123, "grad_norm": 1.6897785663604736, "learning_rate": 5e-05, "loss": 1.8866, "step": 733000 }, { "epoch": 0.7600318509260687, "grad_norm": 1.6449230909347534, "learning_rate": 5e-05, "loss": 1.9059, "step": 734000 }, { "epoch": 0.7610673166630253, "grad_norm": 1.682363510131836, "learning_rate": 5e-05, "loss": 1.9016, "step": 735000 }, { "epoch": 0.7610673166630253, "eval_loss": 1.8788481950759888, "eval_runtime": 23.8164, "eval_samples_per_second": 2076.595, "eval_steps_per_second": 8.146, "step": 735000 }, { "epoch": 0.7621027823999817, "grad_norm": 1.388289213180542, "learning_rate": 5e-05, "loss": 1.9026, "step": 736000 }, { "epoch": 0.7631382481369383, "grad_norm": 1.4822994470596313, "learning_rate": 5e-05, "loss": 1.8702, "step": 737000 }, { "epoch": 0.7641737138738948, "grad_norm": 1.3337068557739258, "learning_rate": 5e-05, "loss": 1.8845, "step": 738000 }, { "epoch": 0.7652091796108512, "grad_norm": 1.6779500246047974, "learning_rate": 5e-05, "loss": 1.8853, "step": 739000 }, { "epoch": 0.7662446453478078, "grad_norm": 1.7902178764343262, "learning_rate": 5e-05, "loss": 1.8996, "step": 740000 }, { "epoch": 0.7662446453478078, "eval_loss": 1.8781044483184814, "eval_runtime": 24.6681, "eval_samples_per_second": 2004.9, "eval_steps_per_second": 7.864, "step": 740000 }, { "epoch": 0.7672801110847642, "grad_norm": 1.4279536008834839, "learning_rate": 5e-05, "loss": 1.8891, "step": 741000 }, { "epoch": 0.7683155768217208, "grad_norm": 1.4116427898406982, "learning_rate": 5e-05, "loss": 1.9103, "step": 742000 }, { "epoch": 0.7693510425586773, "grad_norm": 1.2686455249786377, "learning_rate": 5e-05, "loss": 1.8768, "step": 743000 }, { "epoch": 0.7703865082956337, "grad_norm": 1.9228949546813965, "learning_rate": 5e-05, "loss": 1.8958, "step": 744000 }, { "epoch": 0.7714219740325903, "grad_norm": 1.6242496967315674, "learning_rate": 5e-05, "loss": 1.8885, "step": 745000 }, { "epoch": 0.7714219740325903, "eval_loss": 1.8771369457244873, "eval_runtime": 24.8877, "eval_samples_per_second": 1987.21, "eval_steps_per_second": 7.795, "step": 745000 }, { "epoch": 0.7724574397695467, "grad_norm": 1.3284131288528442, "learning_rate": 5e-05, "loss": 1.8882, "step": 746000 }, { "epoch": 0.7734929055065033, "grad_norm": 2.2554099559783936, "learning_rate": 5e-05, "loss": 1.8915, "step": 747000 }, { "epoch": 0.7745283712434597, "grad_norm": 1.8585548400878906, "learning_rate": 5e-05, "loss": 1.8901, "step": 748000 }, { "epoch": 0.7755638369804162, "grad_norm": 1.4987225532531738, "learning_rate": 5e-05, "loss": 1.8888, "step": 749000 }, { "epoch": 0.7765993027173728, "grad_norm": 1.642417550086975, "learning_rate": 5e-05, "loss": 1.8925, "step": 750000 }, { "epoch": 0.7765993027173728, "eval_loss": 1.8830502033233643, "eval_runtime": 24.3276, "eval_samples_per_second": 2032.956, "eval_steps_per_second": 7.974, "step": 750000 }, { "epoch": 0.7776347684543292, "grad_norm": 1.4527113437652588, "learning_rate": 5e-05, "loss": 1.886, "step": 751000 }, { "epoch": 0.7786702341912858, "grad_norm": 1.6376516819000244, "learning_rate": 5e-05, "loss": 1.8949, "step": 752000 }, { "epoch": 0.7797056999282422, "grad_norm": 1.3692593574523926, "learning_rate": 5e-05, "loss": 1.8791, "step": 753000 }, { "epoch": 0.7807411656651987, "grad_norm": 1.4702037572860718, "learning_rate": 5e-05, "loss": 1.8893, "step": 754000 }, { "epoch": 0.7817766314021553, "grad_norm": 1.5268641710281372, "learning_rate": 5e-05, "loss": 1.8845, "step": 755000 }, { "epoch": 0.7817766314021553, "eval_loss": 1.8741161823272705, "eval_runtime": 23.2405, "eval_samples_per_second": 2128.054, "eval_steps_per_second": 8.348, "step": 755000 }, { "epoch": 0.7828120971391117, "grad_norm": 1.5007132291793823, "learning_rate": 5e-05, "loss": 1.8857, "step": 756000 }, { "epoch": 0.7838475628760682, "grad_norm": 1.7741094827651978, "learning_rate": 5e-05, "loss": 1.9001, "step": 757000 }, { "epoch": 0.7848830286130247, "grad_norm": 1.789960503578186, "learning_rate": 5e-05, "loss": 1.8887, "step": 758000 }, { "epoch": 0.7859184943499812, "grad_norm": 1.7130295038223267, "learning_rate": 5e-05, "loss": 1.8781, "step": 759000 }, { "epoch": 0.7869539600869377, "grad_norm": 1.8921586275100708, "learning_rate": 5e-05, "loss": 1.8832, "step": 760000 }, { "epoch": 0.7869539600869377, "eval_loss": 1.8890156745910645, "eval_runtime": 23.6747, "eval_samples_per_second": 2089.023, "eval_steps_per_second": 8.194, "step": 760000 }, { "epoch": 0.7879894258238942, "grad_norm": 1.4963884353637695, "learning_rate": 5e-05, "loss": 1.8822, "step": 761000 }, { "epoch": 0.7890248915608506, "grad_norm": 1.3713665008544922, "learning_rate": 5e-05, "loss": 1.8814, "step": 762000 }, { "epoch": 0.7900603572978072, "grad_norm": 2.0718443393707275, "learning_rate": 5e-05, "loss": 1.8794, "step": 763000 }, { "epoch": 0.7910958230347637, "grad_norm": 2.4328157901763916, "learning_rate": 5e-05, "loss": 1.8772, "step": 764000 }, { "epoch": 0.7921312887717202, "grad_norm": 1.6719129085540771, "learning_rate": 5e-05, "loss": 1.8859, "step": 765000 }, { "epoch": 0.7921312887717202, "eval_loss": 1.878068447113037, "eval_runtime": 24.513, "eval_samples_per_second": 2017.582, "eval_steps_per_second": 7.914, "step": 765000 }, { "epoch": 0.7931667545086767, "grad_norm": 1.5634863376617432, "learning_rate": 5e-05, "loss": 1.8863, "step": 766000 }, { "epoch": 0.7942022202456331, "grad_norm": 2.2358057498931885, "learning_rate": 5e-05, "loss": 1.8882, "step": 767000 }, { "epoch": 0.7952376859825897, "grad_norm": 1.6573256254196167, "learning_rate": 5e-05, "loss": 1.8934, "step": 768000 }, { "epoch": 0.7962731517195462, "grad_norm": 1.738838791847229, "learning_rate": 5e-05, "loss": 1.8819, "step": 769000 }, { "epoch": 0.7973086174565027, "grad_norm": 1.400238275527954, "learning_rate": 5e-05, "loss": 1.8845, "step": 770000 }, { "epoch": 0.7973086174565027, "eval_loss": 1.871466040611267, "eval_runtime": 26.2147, "eval_samples_per_second": 1886.614, "eval_steps_per_second": 7.4, "step": 770000 }, { "epoch": 0.7983440831934592, "grad_norm": 1.506074070930481, "learning_rate": 5e-05, "loss": 1.8833, "step": 771000 }, { "epoch": 0.7993795489304156, "grad_norm": 1.7725430727005005, "learning_rate": 5e-05, "loss": 1.8825, "step": 772000 }, { "epoch": 0.8004150146673722, "grad_norm": 1.2773728370666504, "learning_rate": 5e-05, "loss": 1.8827, "step": 773000 }, { "epoch": 0.8014504804043286, "grad_norm": 1.9510735273361206, "learning_rate": 5e-05, "loss": 1.8797, "step": 774000 }, { "epoch": 0.8024859461412852, "grad_norm": 1.7016202211380005, "learning_rate": 5e-05, "loss": 1.8922, "step": 775000 }, { "epoch": 0.8024859461412852, "eval_loss": 1.8798218965530396, "eval_runtime": 27.438, "eval_samples_per_second": 1802.497, "eval_steps_per_second": 7.07, "step": 775000 }, { "epoch": 0.8035214118782417, "grad_norm": 1.8934874534606934, "learning_rate": 5e-05, "loss": 1.8825, "step": 776000 }, { "epoch": 0.8045568776151981, "grad_norm": 1.5149208307266235, "learning_rate": 5e-05, "loss": 1.8937, "step": 777000 }, { "epoch": 0.8055923433521547, "grad_norm": 1.3739936351776123, "learning_rate": 5e-05, "loss": 1.8713, "step": 778000 }, { "epoch": 0.8066278090891111, "grad_norm": 1.7757251262664795, "learning_rate": 5e-05, "loss": 1.8805, "step": 779000 }, { "epoch": 0.8076632748260676, "grad_norm": 1.7812187671661377, "learning_rate": 5e-05, "loss": 1.8826, "step": 780000 }, { "epoch": 0.8076632748260676, "eval_loss": 1.865774393081665, "eval_runtime": 27.1199, "eval_samples_per_second": 1823.645, "eval_steps_per_second": 7.153, "step": 780000 }, { "epoch": 0.8086987405630242, "grad_norm": 1.4904496669769287, "learning_rate": 5e-05, "loss": 1.8807, "step": 781000 }, { "epoch": 0.8097342062999806, "grad_norm": 1.5127277374267578, "learning_rate": 5e-05, "loss": 1.8754, "step": 782000 }, { "epoch": 0.8107696720369372, "grad_norm": 1.8934956789016724, "learning_rate": 5e-05, "loss": 1.8928, "step": 783000 }, { "epoch": 0.8118051377738936, "grad_norm": 1.9401856660842896, "learning_rate": 5e-05, "loss": 1.8839, "step": 784000 }, { "epoch": 0.8128406035108501, "grad_norm": 1.5946576595306396, "learning_rate": 5e-05, "loss": 1.8714, "step": 785000 }, { "epoch": 0.8128406035108501, "eval_loss": 1.8689470291137695, "eval_runtime": 27.4211, "eval_samples_per_second": 1803.611, "eval_steps_per_second": 7.075, "step": 785000 }, { "epoch": 0.8138760692478066, "grad_norm": 1.7815802097320557, "learning_rate": 5e-05, "loss": 1.8843, "step": 786000 }, { "epoch": 0.8149115349847631, "grad_norm": 1.684421420097351, "learning_rate": 5e-05, "loss": 1.8895, "step": 787000 }, { "epoch": 0.8159470007217197, "grad_norm": 1.7290705442428589, "learning_rate": 5e-05, "loss": 1.8737, "step": 788000 }, { "epoch": 0.8169824664586761, "grad_norm": 1.3579429388046265, "learning_rate": 5e-05, "loss": 1.8867, "step": 789000 }, { "epoch": 0.8180179321956326, "grad_norm": 1.4426133632659912, "learning_rate": 5e-05, "loss": 1.8792, "step": 790000 }, { "epoch": 0.8180179321956326, "eval_loss": 1.8706306219100952, "eval_runtime": 25.2456, "eval_samples_per_second": 1959.031, "eval_steps_per_second": 7.684, "step": 790000 }, { "epoch": 0.8190533979325891, "grad_norm": 1.7028056383132935, "learning_rate": 5e-05, "loss": 1.8876, "step": 791000 }, { "epoch": 0.8200888636695456, "grad_norm": 1.6472172737121582, "learning_rate": 5e-05, "loss": 1.8853, "step": 792000 }, { "epoch": 0.8211243294065022, "grad_norm": 1.6392238140106201, "learning_rate": 5e-05, "loss": 1.8763, "step": 793000 }, { "epoch": 0.8221597951434586, "grad_norm": 1.2687627077102661, "learning_rate": 5e-05, "loss": 1.8754, "step": 794000 }, { "epoch": 0.8231952608804151, "grad_norm": 1.4193922281265259, "learning_rate": 5e-05, "loss": 1.8778, "step": 795000 }, { "epoch": 0.8231952608804151, "eval_loss": 1.8760771751403809, "eval_runtime": 26.3623, "eval_samples_per_second": 1876.047, "eval_steps_per_second": 7.359, "step": 795000 }, { "epoch": 0.8242307266173716, "grad_norm": 1.3184908628463745, "learning_rate": 5e-05, "loss": 1.8851, "step": 796000 }, { "epoch": 0.8252661923543281, "grad_norm": 1.2626770734786987, "learning_rate": 5e-05, "loss": 1.8622, "step": 797000 }, { "epoch": 0.8263016580912846, "grad_norm": 1.768161416053772, "learning_rate": 5e-05, "loss": 1.8822, "step": 798000 }, { "epoch": 0.8273371238282411, "grad_norm": 2.2609405517578125, "learning_rate": 5e-05, "loss": 1.8776, "step": 799000 }, { "epoch": 0.8283725895651975, "grad_norm": 1.4166158437728882, "learning_rate": 5e-05, "loss": 1.8834, "step": 800000 }, { "epoch": 0.8283725895651975, "eval_loss": 1.874098777770996, "eval_runtime": 27.908, "eval_samples_per_second": 1772.144, "eval_steps_per_second": 6.951, "step": 800000 }, { "epoch": 0.8294080553021541, "grad_norm": 1.5788943767547607, "learning_rate": 5e-05, "loss": 1.8854, "step": 801000 }, { "epoch": 0.8304435210391106, "grad_norm": 1.682399034500122, "learning_rate": 5e-05, "loss": 1.89, "step": 802000 }, { "epoch": 0.831478986776067, "grad_norm": 1.3524305820465088, "learning_rate": 5e-05, "loss": 1.8893, "step": 803000 }, { "epoch": 0.8325144525130236, "grad_norm": 2.113900661468506, "learning_rate": 5e-05, "loss": 1.8842, "step": 804000 }, { "epoch": 0.83354991824998, "grad_norm": 1.3972655534744263, "learning_rate": 5e-05, "loss": 1.8868, "step": 805000 }, { "epoch": 0.83354991824998, "eval_loss": 1.8680431842803955, "eval_runtime": 32.3741, "eval_samples_per_second": 1527.672, "eval_steps_per_second": 5.992, "step": 805000 }, { "epoch": 0.8345853839869366, "grad_norm": 1.905885100364685, "learning_rate": 5e-05, "loss": 1.8913, "step": 806000 }, { "epoch": 0.835620849723893, "grad_norm": 1.618198037147522, "learning_rate": 5e-05, "loss": 1.8756, "step": 807000 }, { "epoch": 0.8366563154608495, "grad_norm": 1.955761432647705, "learning_rate": 5e-05, "loss": 1.8883, "step": 808000 }, { "epoch": 0.8376917811978061, "grad_norm": 1.9344592094421387, "learning_rate": 5e-05, "loss": 1.8843, "step": 809000 }, { "epoch": 0.8387272469347625, "grad_norm": 1.4266360998153687, "learning_rate": 5e-05, "loss": 1.8728, "step": 810000 }, { "epoch": 0.8387272469347625, "eval_loss": 1.8683350086212158, "eval_runtime": 24.7677, "eval_samples_per_second": 1996.836, "eval_steps_per_second": 7.833, "step": 810000 }, { "epoch": 0.8397627126717191, "grad_norm": 1.6188902854919434, "learning_rate": 5e-05, "loss": 1.8817, "step": 811000 }, { "epoch": 0.8407981784086755, "grad_norm": 1.6710392236709595, "learning_rate": 5e-05, "loss": 1.8732, "step": 812000 }, { "epoch": 0.841833644145632, "grad_norm": 1.4408323764801025, "learning_rate": 5e-05, "loss": 1.8763, "step": 813000 }, { "epoch": 0.8428691098825886, "grad_norm": 1.6970560550689697, "learning_rate": 5e-05, "loss": 1.8796, "step": 814000 }, { "epoch": 0.843904575619545, "grad_norm": 1.4276736974716187, "learning_rate": 5e-05, "loss": 1.8792, "step": 815000 }, { "epoch": 0.843904575619545, "eval_loss": 1.8652664422988892, "eval_runtime": 25.7373, "eval_samples_per_second": 1921.61, "eval_steps_per_second": 7.538, "step": 815000 }, { "epoch": 0.8449400413565016, "grad_norm": 1.2162322998046875, "learning_rate": 5e-05, "loss": 1.8751, "step": 816000 }, { "epoch": 0.845975507093458, "grad_norm": 1.6202760934829712, "learning_rate": 5e-05, "loss": 1.8746, "step": 817000 }, { "epoch": 0.8470109728304145, "grad_norm": 1.5370842218399048, "learning_rate": 5e-05, "loss": 1.8679, "step": 818000 }, { "epoch": 0.848046438567371, "grad_norm": 1.9520437717437744, "learning_rate": 5e-05, "loss": 1.8777, "step": 819000 }, { "epoch": 0.8490819043043275, "grad_norm": 1.6644766330718994, "learning_rate": 5e-05, "loss": 1.875, "step": 820000 }, { "epoch": 0.8490819043043275, "eval_loss": 1.8683598041534424, "eval_runtime": 25.0415, "eval_samples_per_second": 1975.005, "eval_steps_per_second": 7.747, "step": 820000 }, { "epoch": 0.8501173700412841, "grad_norm": 1.677675724029541, "learning_rate": 5e-05, "loss": 1.8768, "step": 821000 }, { "epoch": 0.8511528357782405, "grad_norm": 1.6333279609680176, "learning_rate": 5e-05, "loss": 1.8826, "step": 822000 }, { "epoch": 0.852188301515197, "grad_norm": 2.3194546699523926, "learning_rate": 5e-05, "loss": 1.8713, "step": 823000 }, { "epoch": 0.8532237672521535, "grad_norm": 1.945440411567688, "learning_rate": 5e-05, "loss": 1.8675, "step": 824000 }, { "epoch": 0.85425923298911, "grad_norm": 1.6451483964920044, "learning_rate": 5e-05, "loss": 1.8821, "step": 825000 }, { "epoch": 0.85425923298911, "eval_loss": 1.8589342832565308, "eval_runtime": 24.706, "eval_samples_per_second": 2001.819, "eval_steps_per_second": 7.852, "step": 825000 }, { "epoch": 0.8552946987260664, "grad_norm": 1.882265329360962, "learning_rate": 5e-05, "loss": 1.8867, "step": 826000 }, { "epoch": 0.856330164463023, "grad_norm": 1.4808956384658813, "learning_rate": 5e-05, "loss": 1.8727, "step": 827000 }, { "epoch": 0.8573656301999795, "grad_norm": 1.383360505104065, "learning_rate": 5e-05, "loss": 1.873, "step": 828000 }, { "epoch": 0.858401095936936, "grad_norm": 1.4626834392547607, "learning_rate": 5e-05, "loss": 1.8629, "step": 829000 }, { "epoch": 0.8594365616738925, "grad_norm": 1.668229341506958, "learning_rate": 5e-05, "loss": 1.8877, "step": 830000 }, { "epoch": 0.8594365616738925, "eval_loss": 1.8676975965499878, "eval_runtime": 24.738, "eval_samples_per_second": 1999.229, "eval_steps_per_second": 7.842, "step": 830000 }, { "epoch": 0.8604720274108489, "grad_norm": 1.9062445163726807, "learning_rate": 5e-05, "loss": 1.8692, "step": 831000 }, { "epoch": 0.8615074931478055, "grad_norm": 1.6044979095458984, "learning_rate": 5e-05, "loss": 1.8696, "step": 832000 }, { "epoch": 0.862542958884762, "grad_norm": 1.5497651100158691, "learning_rate": 5e-05, "loss": 1.8772, "step": 833000 }, { "epoch": 0.8635784246217185, "grad_norm": 2.272531747817993, "learning_rate": 5e-05, "loss": 1.8846, "step": 834000 }, { "epoch": 0.864613890358675, "grad_norm": 1.831063151359558, "learning_rate": 5e-05, "loss": 1.8684, "step": 835000 }, { "epoch": 0.864613890358675, "eval_loss": 1.8688569068908691, "eval_runtime": 24.7995, "eval_samples_per_second": 1994.276, "eval_steps_per_second": 7.823, "step": 835000 }, { "epoch": 0.8656493560956314, "grad_norm": 1.9683046340942383, "learning_rate": 5e-05, "loss": 1.8768, "step": 836000 }, { "epoch": 0.866684821832588, "grad_norm": 1.6770648956298828, "learning_rate": 5e-05, "loss": 1.8774, "step": 837000 }, { "epoch": 0.8677202875695444, "grad_norm": 1.9348267316818237, "learning_rate": 5e-05, "loss": 1.8816, "step": 838000 }, { "epoch": 0.868755753306501, "grad_norm": 1.5517618656158447, "learning_rate": 5e-05, "loss": 1.8761, "step": 839000 }, { "epoch": 0.8697912190434575, "grad_norm": 1.8172694444656372, "learning_rate": 5e-05, "loss": 1.8711, "step": 840000 }, { "epoch": 0.8697912190434575, "eval_loss": 1.8648767471313477, "eval_runtime": 21.0335, "eval_samples_per_second": 2351.339, "eval_steps_per_second": 9.223, "step": 840000 }, { "epoch": 0.8708266847804139, "grad_norm": 1.5013890266418457, "learning_rate": 5e-05, "loss": 1.8691, "step": 841000 }, { "epoch": 0.8718621505173705, "grad_norm": 1.9419187307357788, "learning_rate": 5e-05, "loss": 1.8883, "step": 842000 }, { "epoch": 0.8728976162543269, "grad_norm": 1.5654350519180298, "learning_rate": 5e-05, "loss": 1.8808, "step": 843000 }, { "epoch": 0.8739330819912835, "grad_norm": 1.599421739578247, "learning_rate": 5e-05, "loss": 1.8724, "step": 844000 }, { "epoch": 0.87496854772824, "grad_norm": 1.4926223754882812, "learning_rate": 5e-05, "loss": 1.8562, "step": 845000 }, { "epoch": 0.87496854772824, "eval_loss": 1.8575286865234375, "eval_runtime": 21.0794, "eval_samples_per_second": 2346.223, "eval_steps_per_second": 9.203, "step": 845000 }, { "epoch": 0.8760040134651964, "grad_norm": 1.4079915285110474, "learning_rate": 5e-05, "loss": 1.866, "step": 846000 }, { "epoch": 0.877039479202153, "grad_norm": 1.6430548429489136, "learning_rate": 5e-05, "loss": 1.8701, "step": 847000 }, { "epoch": 0.8780749449391094, "grad_norm": 1.7442820072174072, "learning_rate": 5e-05, "loss": 1.8801, "step": 848000 }, { "epoch": 0.879110410676066, "grad_norm": 1.5199944972991943, "learning_rate": 5e-05, "loss": 1.8759, "step": 849000 }, { "epoch": 0.8801458764130224, "grad_norm": 1.440382719039917, "learning_rate": 5e-05, "loss": 1.8863, "step": 850000 }, { "epoch": 0.8801458764130224, "eval_loss": 1.8506907224655151, "eval_runtime": 22.4651, "eval_samples_per_second": 2201.505, "eval_steps_per_second": 8.636, "step": 850000 }, { "epoch": 0.8811813421499789, "grad_norm": 1.3661401271820068, "learning_rate": 5e-05, "loss": 1.8817, "step": 851000 }, { "epoch": 0.8822168078869355, "grad_norm": 1.334959626197815, "learning_rate": 5e-05, "loss": 1.8656, "step": 852000 }, { "epoch": 0.8832522736238919, "grad_norm": 1.2820968627929688, "learning_rate": 5e-05, "loss": 1.8648, "step": 853000 }, { "epoch": 0.8842877393608484, "grad_norm": 1.7478554248809814, "learning_rate": 5e-05, "loss": 1.8513, "step": 854000 }, { "epoch": 0.8853232050978049, "grad_norm": 1.6486462354660034, "learning_rate": 5e-05, "loss": 1.8729, "step": 855000 }, { "epoch": 0.8853232050978049, "eval_loss": 1.8535994291305542, "eval_runtime": 21.8631, "eval_samples_per_second": 2262.122, "eval_steps_per_second": 8.873, "step": 855000 }, { "epoch": 0.8863586708347614, "grad_norm": 1.7058559656143188, "learning_rate": 5e-05, "loss": 1.8646, "step": 856000 }, { "epoch": 0.887394136571718, "grad_norm": 1.849015712738037, "learning_rate": 5e-05, "loss": 1.8641, "step": 857000 }, { "epoch": 0.8884296023086744, "grad_norm": 1.493324637413025, "learning_rate": 5e-05, "loss": 1.8717, "step": 858000 }, { "epoch": 0.8894650680456309, "grad_norm": 1.6665401458740234, "learning_rate": 5e-05, "loss": 1.8663, "step": 859000 }, { "epoch": 0.8905005337825874, "grad_norm": 1.5283894538879395, "learning_rate": 5e-05, "loss": 1.8724, "step": 860000 }, { "epoch": 0.8905005337825874, "eval_loss": 1.8644919395446777, "eval_runtime": 23.2821, "eval_samples_per_second": 2124.25, "eval_steps_per_second": 8.333, "step": 860000 }, { "epoch": 0.8915359995195439, "grad_norm": 1.7026426792144775, "learning_rate": 5e-05, "loss": 1.8713, "step": 861000 }, { "epoch": 0.8925714652565004, "grad_norm": 1.3697137832641602, "learning_rate": 5e-05, "loss": 1.8691, "step": 862000 }, { "epoch": 0.8936069309934569, "grad_norm": 1.3484594821929932, "learning_rate": 5e-05, "loss": 1.8807, "step": 863000 }, { "epoch": 0.8946423967304133, "grad_norm": 1.725483775138855, "learning_rate": 5e-05, "loss": 1.8714, "step": 864000 }, { "epoch": 0.8956778624673699, "grad_norm": 1.7059649229049683, "learning_rate": 5e-05, "loss": 1.8663, "step": 865000 }, { "epoch": 0.8956778624673699, "eval_loss": 1.8639227151870728, "eval_runtime": 22.1701, "eval_samples_per_second": 2230.796, "eval_steps_per_second": 8.751, "step": 865000 }, { "epoch": 0.8967133282043264, "grad_norm": 1.9144644737243652, "learning_rate": 5e-05, "loss": 1.8609, "step": 866000 }, { "epoch": 0.8977487939412829, "grad_norm": 2.074328899383545, "learning_rate": 5e-05, "loss": 1.8749, "step": 867000 }, { "epoch": 0.8987842596782394, "grad_norm": 1.2168279886245728, "learning_rate": 5e-05, "loss": 1.8611, "step": 868000 }, { "epoch": 0.8998197254151958, "grad_norm": 1.6179542541503906, "learning_rate": 5e-05, "loss": 1.8666, "step": 869000 }, { "epoch": 0.9008551911521524, "grad_norm": 1.4882663488388062, "learning_rate": 5e-05, "loss": 1.861, "step": 870000 }, { "epoch": 0.9008551911521524, "eval_loss": 1.8574090003967285, "eval_runtime": 22.0353, "eval_samples_per_second": 2244.441, "eval_steps_per_second": 8.804, "step": 870000 }, { "epoch": 0.9018906568891089, "grad_norm": 1.5473912954330444, "learning_rate": 5e-05, "loss": 1.8784, "step": 871000 }, { "epoch": 0.9029261226260654, "grad_norm": 1.5128304958343506, "learning_rate": 5e-05, "loss": 1.867, "step": 872000 }, { "epoch": 0.9039615883630219, "grad_norm": 1.6824181079864502, "learning_rate": 5e-05, "loss": 1.8605, "step": 873000 }, { "epoch": 0.9049970540999783, "grad_norm": 1.4800105094909668, "learning_rate": 5e-05, "loss": 1.8539, "step": 874000 }, { "epoch": 0.9060325198369349, "grad_norm": 1.7847933769226074, "learning_rate": 5e-05, "loss": 1.8742, "step": 875000 }, { "epoch": 0.9060325198369349, "eval_loss": 1.8582428693771362, "eval_runtime": 22.3155, "eval_samples_per_second": 2216.265, "eval_steps_per_second": 8.694, "step": 875000 }, { "epoch": 0.9070679855738913, "grad_norm": 1.5011705160140991, "learning_rate": 5e-05, "loss": 1.8654, "step": 876000 }, { "epoch": 0.9081034513108478, "grad_norm": 2.146315336227417, "learning_rate": 5e-05, "loss": 1.8602, "step": 877000 }, { "epoch": 0.9091389170478044, "grad_norm": 1.6306260824203491, "learning_rate": 5e-05, "loss": 1.8645, "step": 878000 }, { "epoch": 0.9101743827847608, "grad_norm": 2.065568685531616, "learning_rate": 5e-05, "loss": 1.8591, "step": 879000 }, { "epoch": 0.9112098485217174, "grad_norm": 1.3474091291427612, "learning_rate": 5e-05, "loss": 1.8722, "step": 880000 }, { "epoch": 0.9112098485217174, "eval_loss": 1.8544831275939941, "eval_runtime": 23.1598, "eval_samples_per_second": 2135.47, "eval_steps_per_second": 8.377, "step": 880000 }, { "epoch": 0.9122453142586738, "grad_norm": 1.6332334280014038, "learning_rate": 5e-05, "loss": 1.8727, "step": 881000 }, { "epoch": 0.9132807799956303, "grad_norm": 1.5789443254470825, "learning_rate": 5e-05, "loss": 1.8613, "step": 882000 }, { "epoch": 0.9143162457325869, "grad_norm": 1.859616994857788, "learning_rate": 5e-05, "loss": 1.8518, "step": 883000 }, { "epoch": 0.9153517114695433, "grad_norm": 1.6380046606063843, "learning_rate": 5e-05, "loss": 1.8592, "step": 884000 }, { "epoch": 0.9163871772064999, "grad_norm": 1.8654091358184814, "learning_rate": 5e-05, "loss": 1.853, "step": 885000 }, { "epoch": 0.9163871772064999, "eval_loss": 1.856461763381958, "eval_runtime": 22.3583, "eval_samples_per_second": 2212.021, "eval_steps_per_second": 8.677, "step": 885000 }, { "epoch": 0.9174226429434563, "grad_norm": 1.3017899990081787, "learning_rate": 5e-05, "loss": 1.8688, "step": 886000 }, { "epoch": 0.9184581086804128, "grad_norm": 2.0168769359588623, "learning_rate": 5e-05, "loss": 1.865, "step": 887000 }, { "epoch": 0.9194935744173693, "grad_norm": 1.8421316146850586, "learning_rate": 5e-05, "loss": 1.8549, "step": 888000 }, { "epoch": 0.9205290401543258, "grad_norm": 1.4435760974884033, "learning_rate": 5e-05, "loss": 1.8736, "step": 889000 }, { "epoch": 0.9215645058912824, "grad_norm": 1.6617441177368164, "learning_rate": 5e-05, "loss": 1.8635, "step": 890000 }, { "epoch": 0.9215645058912824, "eval_loss": 1.8535727262496948, "eval_runtime": 21.8604, "eval_samples_per_second": 2262.398, "eval_steps_per_second": 8.874, "step": 890000 }, { "epoch": 0.9225999716282388, "grad_norm": 1.338428258895874, "learning_rate": 5e-05, "loss": 1.8652, "step": 891000 }, { "epoch": 0.9236354373651953, "grad_norm": 1.660236120223999, "learning_rate": 5e-05, "loss": 1.8601, "step": 892000 }, { "epoch": 0.9246709031021518, "grad_norm": 1.5668145418167114, "learning_rate": 5e-05, "loss": 1.8639, "step": 893000 }, { "epoch": 0.9257063688391083, "grad_norm": 1.8961373567581177, "learning_rate": 5e-05, "loss": 1.8649, "step": 894000 }, { "epoch": 0.9267418345760648, "grad_norm": 1.5430703163146973, "learning_rate": 5e-05, "loss": 1.864, "step": 895000 }, { "epoch": 0.9267418345760648, "eval_loss": 1.8523567914962769, "eval_runtime": 20.6865, "eval_samples_per_second": 2390.781, "eval_steps_per_second": 9.378, "step": 895000 }, { "epoch": 0.9277773003130213, "grad_norm": 1.4171661138534546, "learning_rate": 5e-05, "loss": 1.8722, "step": 896000 }, { "epoch": 0.9288127660499778, "grad_norm": 1.6401444673538208, "learning_rate": 5e-05, "loss": 1.8591, "step": 897000 }, { "epoch": 0.9298482317869343, "grad_norm": 1.6377249956130981, "learning_rate": 5e-05, "loss": 1.8618, "step": 898000 }, { "epoch": 0.9308836975238908, "grad_norm": 1.9125347137451172, "learning_rate": 5e-05, "loss": 1.8666, "step": 899000 }, { "epoch": 0.9319191632608472, "grad_norm": 1.4101911783218384, "learning_rate": 5e-05, "loss": 1.8697, "step": 900000 }, { "epoch": 0.9319191632608472, "eval_loss": 1.8492544889450073, "eval_runtime": 22.078, "eval_samples_per_second": 2240.106, "eval_steps_per_second": 8.787, "step": 900000 }, { "epoch": 0.9329546289978038, "grad_norm": 1.3126411437988281, "learning_rate": 5e-05, "loss": 1.8554, "step": 901000 }, { "epoch": 0.9339900947347602, "grad_norm": 1.494831919670105, "learning_rate": 5e-05, "loss": 1.8655, "step": 902000 }, { "epoch": 0.9350255604717168, "grad_norm": 1.6403518915176392, "learning_rate": 5e-05, "loss": 1.8702, "step": 903000 }, { "epoch": 0.9360610262086733, "grad_norm": 1.4031927585601807, "learning_rate": 5e-05, "loss": 1.8737, "step": 904000 }, { "epoch": 0.9370964919456297, "grad_norm": 1.785649061203003, "learning_rate": 5e-05, "loss": 1.8598, "step": 905000 }, { "epoch": 0.9370964919456297, "eval_loss": 1.8533316850662231, "eval_runtime": 21.2318, "eval_samples_per_second": 2329.385, "eval_steps_per_second": 9.137, "step": 905000 }, { "epoch": 0.9381319576825863, "grad_norm": 1.651497721672058, "learning_rate": 5e-05, "loss": 1.8555, "step": 906000 }, { "epoch": 0.9391674234195427, "grad_norm": 1.7172114849090576, "learning_rate": 5e-05, "loss": 1.8621, "step": 907000 }, { "epoch": 0.9402028891564993, "grad_norm": 1.606945276260376, "learning_rate": 5e-05, "loss": 1.8664, "step": 908000 }, { "epoch": 0.9412383548934558, "grad_norm": 1.686049222946167, "learning_rate": 5e-05, "loss": 1.8696, "step": 909000 }, { "epoch": 0.9422738206304122, "grad_norm": 1.7451001405715942, "learning_rate": 5e-05, "loss": 1.8609, "step": 910000 }, { "epoch": 0.9422738206304122, "eval_loss": 1.8576778173446655, "eval_runtime": 33.0236, "eval_samples_per_second": 1497.626, "eval_steps_per_second": 5.875, "step": 910000 }, { "epoch": 0.9433092863673688, "grad_norm": 1.8006882667541504, "learning_rate": 5e-05, "loss": 1.857, "step": 911000 }, { "epoch": 0.9443447521043252, "grad_norm": 1.6737642288208008, "learning_rate": 5e-05, "loss": 1.8548, "step": 912000 }, { "epoch": 0.9453802178412818, "grad_norm": 1.5626418590545654, "learning_rate": 5e-05, "loss": 1.8477, "step": 913000 }, { "epoch": 0.9464156835782382, "grad_norm": 1.6796774864196777, "learning_rate": 5e-05, "loss": 1.8686, "step": 914000 }, { "epoch": 0.9474511493151947, "grad_norm": 1.7748136520385742, "learning_rate": 5e-05, "loss": 1.8657, "step": 915000 }, { "epoch": 0.9474511493151947, "eval_loss": 1.8437752723693848, "eval_runtime": 55.0601, "eval_samples_per_second": 898.237, "eval_steps_per_second": 3.523, "step": 915000 }, { "epoch": 0.9484866150521513, "grad_norm": 1.745834469795227, "learning_rate": 5e-05, "loss": 1.8585, "step": 916000 }, { "epoch": 0.9495220807891077, "grad_norm": 1.5798673629760742, "learning_rate": 5e-05, "loss": 1.8673, "step": 917000 }, { "epoch": 0.9505575465260643, "grad_norm": 1.6271969079971313, "learning_rate": 5e-05, "loss": 1.8554, "step": 918000 }, { "epoch": 0.9515930122630207, "grad_norm": 1.3999176025390625, "learning_rate": 5e-05, "loss": 1.8628, "step": 919000 }, { "epoch": 0.9526284779999772, "grad_norm": 1.6120986938476562, "learning_rate": 5e-05, "loss": 1.8623, "step": 920000 }, { "epoch": 0.9526284779999772, "eval_loss": 1.8485383987426758, "eval_runtime": 19.9677, "eval_samples_per_second": 2476.855, "eval_steps_per_second": 9.716, "step": 920000 }, { "epoch": 0.9536639437369338, "grad_norm": 1.9189943075180054, "learning_rate": 5e-05, "loss": 1.8525, "step": 921000 }, { "epoch": 0.9546994094738902, "grad_norm": 1.4193308353424072, "learning_rate": 5e-05, "loss": 1.8596, "step": 922000 }, { "epoch": 0.9557348752108467, "grad_norm": 1.5809720754623413, "learning_rate": 5e-05, "loss": 1.8618, "step": 923000 }, { "epoch": 0.9567703409478032, "grad_norm": 1.699925184249878, "learning_rate": 5e-05, "loss": 1.8652, "step": 924000 }, { "epoch": 0.9578058066847597, "grad_norm": 1.6282320022583008, "learning_rate": 5e-05, "loss": 1.8619, "step": 925000 }, { "epoch": 0.9578058066847597, "eval_loss": 1.8499970436096191, "eval_runtime": 22.1279, "eval_samples_per_second": 2235.052, "eval_steps_per_second": 8.767, "step": 925000 }, { "epoch": 0.9588412724217162, "grad_norm": 1.5954972505569458, "learning_rate": 5e-05, "loss": 1.8654, "step": 926000 }, { "epoch": 0.9598767381586727, "grad_norm": 1.6539947986602783, "learning_rate": 5e-05, "loss": 1.8646, "step": 927000 }, { "epoch": 0.9609122038956291, "grad_norm": 1.4428025484085083, "learning_rate": 5e-05, "loss": 1.872, "step": 928000 }, { "epoch": 0.9619476696325857, "grad_norm": 1.92341148853302, "learning_rate": 5e-05, "loss": 1.8572, "step": 929000 }, { "epoch": 0.9629831353695422, "grad_norm": 1.6431875228881836, "learning_rate": 5e-05, "loss": 1.8657, "step": 930000 }, { "epoch": 0.9629831353695422, "eval_loss": 1.848019003868103, "eval_runtime": 20.3315, "eval_samples_per_second": 2432.533, "eval_steps_per_second": 9.542, "step": 930000 }, { "epoch": 0.9640186011064987, "grad_norm": 1.5909419059753418, "learning_rate": 5e-05, "loss": 1.847, "step": 931000 }, { "epoch": 0.9650540668434552, "grad_norm": 1.510219693183899, "learning_rate": 5e-05, "loss": 1.8584, "step": 932000 }, { "epoch": 0.9660895325804116, "grad_norm": 1.9107452630996704, "learning_rate": 5e-05, "loss": 1.8653, "step": 933000 }, { "epoch": 0.9671249983173682, "grad_norm": 1.6867653131484985, "learning_rate": 5e-05, "loss": 1.8519, "step": 934000 }, { "epoch": 0.9681604640543247, "grad_norm": 1.296783685684204, "learning_rate": 5e-05, "loss": 1.8547, "step": 935000 }, { "epoch": 0.9681604640543247, "eval_loss": 1.8461804389953613, "eval_runtime": 21.7793, "eval_samples_per_second": 2270.826, "eval_steps_per_second": 8.908, "step": 935000 }, { "epoch": 0.9691959297912812, "grad_norm": 1.8731415271759033, "learning_rate": 5e-05, "loss": 1.8565, "step": 936000 }, { "epoch": 0.9702313955282377, "grad_norm": 1.3044154644012451, "learning_rate": 5e-05, "loss": 1.8545, "step": 937000 }, { "epoch": 0.9712668612651941, "grad_norm": 1.4259470701217651, "learning_rate": 5e-05, "loss": 1.8572, "step": 938000 }, { "epoch": 0.9723023270021507, "grad_norm": 1.9832919836044312, "learning_rate": 5e-05, "loss": 1.8523, "step": 939000 }, { "epoch": 0.9733377927391071, "grad_norm": 1.3519784212112427, "learning_rate": 5e-05, "loss": 1.8587, "step": 940000 }, { "epoch": 0.9733377927391071, "eval_loss": 1.8498455286026, "eval_runtime": 20.9376, "eval_samples_per_second": 2362.116, "eval_steps_per_second": 9.266, "step": 940000 }, { "epoch": 0.9743732584760637, "grad_norm": 1.4580146074295044, "learning_rate": 5e-05, "loss": 1.8483, "step": 941000 }, { "epoch": 0.9754087242130202, "grad_norm": 1.5738118886947632, "learning_rate": 5e-05, "loss": 1.8556, "step": 942000 }, { "epoch": 0.9764441899499766, "grad_norm": 1.5515236854553223, "learning_rate": 5e-05, "loss": 1.8508, "step": 943000 }, { "epoch": 0.9774796556869332, "grad_norm": 2.112576723098755, "learning_rate": 5e-05, "loss": 1.8533, "step": 944000 }, { "epoch": 0.9785151214238896, "grad_norm": 1.3775643110275269, "learning_rate": 5e-05, "loss": 1.8551, "step": 945000 }, { "epoch": 0.9785151214238896, "eval_loss": 1.84758460521698, "eval_runtime": 22.0596, "eval_samples_per_second": 2241.967, "eval_steps_per_second": 8.794, "step": 945000 }, { "epoch": 0.9795505871608461, "grad_norm": 1.6783188581466675, "learning_rate": 5e-05, "loss": 1.8588, "step": 946000 }, { "epoch": 0.9805860528978027, "grad_norm": 1.7327836751937866, "learning_rate": 5e-05, "loss": 1.85, "step": 947000 }, { "epoch": 0.9816215186347591, "grad_norm": 1.8131853342056274, "learning_rate": 5e-05, "loss": 1.8477, "step": 948000 }, { "epoch": 0.9826569843717157, "grad_norm": 1.3955539464950562, "learning_rate": 5e-05, "loss": 1.848, "step": 949000 }, { "epoch": 0.9836924501086721, "grad_norm": 1.5675952434539795, "learning_rate": 5e-05, "loss": 1.8488, "step": 950000 }, { "epoch": 0.9836924501086721, "eval_loss": 1.8464607000350952, "eval_runtime": 22.4257, "eval_samples_per_second": 2205.372, "eval_steps_per_second": 8.651, "step": 950000 }, { "epoch": 0.9847279158456286, "grad_norm": 1.5982122421264648, "learning_rate": 5e-05, "loss": 1.8573, "step": 951000 }, { "epoch": 0.9857633815825851, "grad_norm": 1.4788215160369873, "learning_rate": 5e-05, "loss": 1.8544, "step": 952000 }, { "epoch": 0.9867988473195416, "grad_norm": 1.665727972984314, "learning_rate": 5e-05, "loss": 1.8531, "step": 953000 }, { "epoch": 0.9878343130564982, "grad_norm": 1.7656697034835815, "learning_rate": 5e-05, "loss": 1.8471, "step": 954000 }, { "epoch": 0.9888697787934546, "grad_norm": 1.8535689115524292, "learning_rate": 5e-05, "loss": 1.8412, "step": 955000 }, { "epoch": 0.9888697787934546, "eval_loss": 1.8463835716247559, "eval_runtime": 21.8162, "eval_samples_per_second": 2266.983, "eval_steps_per_second": 8.892, "step": 955000 }, { "epoch": 0.9899052445304111, "grad_norm": 2.037118673324585, "learning_rate": 5e-05, "loss": 1.8575, "step": 956000 }, { "epoch": 0.9909407102673676, "grad_norm": 1.5285453796386719, "learning_rate": 5e-05, "loss": 1.8624, "step": 957000 }, { "epoch": 0.9919761760043241, "grad_norm": 1.7024654150009155, "learning_rate": 5e-05, "loss": 1.8641, "step": 958000 }, { "epoch": 0.9930116417412806, "grad_norm": 1.6414403915405273, "learning_rate": 5e-05, "loss": 1.8433, "step": 959000 }, { "epoch": 0.9940471074782371, "grad_norm": 1.518936276435852, "learning_rate": 5e-05, "loss": 1.857, "step": 960000 }, { "epoch": 0.9940471074782371, "eval_loss": 1.8379032611846924, "eval_runtime": 21.6695, "eval_samples_per_second": 2282.33, "eval_steps_per_second": 8.953, "step": 960000 }, { "epoch": 0.9950825732151936, "grad_norm": 1.7347865104675293, "learning_rate": 5e-05, "loss": 1.8598, "step": 961000 }, { "epoch": 0.9961180389521501, "grad_norm": 1.986828327178955, "learning_rate": 5e-05, "loss": 1.8432, "step": 962000 }, { "epoch": 0.9971535046891066, "grad_norm": 1.5118800401687622, "learning_rate": 5e-05, "loss": 1.8532, "step": 963000 }, { "epoch": 0.9981889704260631, "grad_norm": 1.719777226448059, "learning_rate": 5e-05, "loss": 1.8487, "step": 964000 }, { "epoch": 0.9992244361630196, "grad_norm": 1.7723966836929321, "learning_rate": 5e-05, "loss": 1.8564, "step": 965000 }, { "epoch": 0.9992244361630196, "eval_loss": 1.8406885862350464, "eval_runtime": 21.4295, "eval_samples_per_second": 2307.889, "eval_steps_per_second": 9.053, "step": 965000 } ], "logging_steps": 1000, "max_steps": 965749, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2642475996312345e+20, "train_batch_size": 64, "trial_name": null, "trial_params": null }