{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09991722833155965, "eval_steps": 500, "global_step": 845, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011824524062906468, "grad_norm": 1.7523608207702637, "learning_rate": 0.0, "loss": 0.9062, "num_tokens": 628048.0, "step": 1 }, { "epoch": 0.00023649048125812936, "grad_norm": 1.6680941581726074, "learning_rate": 2.307692307692308e-06, "loss": 0.8713, "num_tokens": 1266689.0, "step": 2 }, { "epoch": 0.000354735721887194, "grad_norm": 1.625159502029419, "learning_rate": 4.615384615384616e-06, "loss": 0.873, "num_tokens": 1900338.0, "step": 3 }, { "epoch": 0.0004729809625162587, "grad_norm": 1.3783349990844727, "learning_rate": 6.923076923076923e-06, "loss": 0.8628, "num_tokens": 2539095.0, "step": 4 }, { "epoch": 0.0005912262031453234, "grad_norm": 1.1593743562698364, "learning_rate": 9.230769230769232e-06, "loss": 0.8533, "num_tokens": 3172740.0, "step": 5 }, { "epoch": 0.000709471443774388, "grad_norm": 1.1372928619384766, "learning_rate": 1.153846153846154e-05, "loss": 0.8015, "num_tokens": 3810090.0, "step": 6 }, { "epoch": 0.0008277166844034528, "grad_norm": 1.4683241844177246, "learning_rate": 1.3846153846153847e-05, "loss": 0.8613, "num_tokens": 4447913.0, "step": 7 }, { "epoch": 0.0009459619250325174, "grad_norm": 1.2824925184249878, "learning_rate": 1.6153846153846154e-05, "loss": 0.7549, "num_tokens": 5085253.0, "step": 8 }, { "epoch": 0.001064207165661582, "grad_norm": 0.9277980923652649, "learning_rate": 1.8461538461538465e-05, "loss": 0.7863, "num_tokens": 5721930.0, "step": 9 }, { "epoch": 0.0011824524062906468, "grad_norm": 1.0162955522537231, "learning_rate": 2.076923076923077e-05, "loss": 0.755, "num_tokens": 6354428.0, "step": 10 }, { "epoch": 0.0013006976469197116, "grad_norm": 0.8888687491416931, "learning_rate": 2.307692307692308e-05, "loss": 0.7388, "num_tokens": 6988898.0, "step": 11 }, { "epoch": 0.001418942887548776, "grad_norm": 0.722545862197876, "learning_rate": 2.5384615384615386e-05, "loss": 0.7017, "num_tokens": 7627058.0, "step": 12 }, { "epoch": 0.0015371881281778408, "grad_norm": 0.7729371786117554, "learning_rate": 2.7692307692307694e-05, "loss": 0.6836, "num_tokens": 8261730.0, "step": 13 }, { "epoch": 0.0016554333688069056, "grad_norm": 0.6504688858985901, "learning_rate": 3e-05, "loss": 0.6866, "num_tokens": 8889249.0, "step": 14 }, { "epoch": 0.0017736786094359701, "grad_norm": 0.6326490640640259, "learning_rate": 3.230769230769231e-05, "loss": 0.6229, "num_tokens": 9524710.0, "step": 15 }, { "epoch": 0.0018919238500650349, "grad_norm": 0.6311523914337158, "learning_rate": 3.461538461538461e-05, "loss": 0.6436, "num_tokens": 10161723.0, "step": 16 }, { "epoch": 0.0020101690906940994, "grad_norm": 2.1880621910095215, "learning_rate": 3.692307692307693e-05, "loss": 0.6368, "num_tokens": 10793622.0, "step": 17 }, { "epoch": 0.002128414331323164, "grad_norm": 0.766629695892334, "learning_rate": 3.923076923076923e-05, "loss": 0.64, "num_tokens": 11420435.0, "step": 18 }, { "epoch": 0.002246659571952229, "grad_norm": 0.6217833161354065, "learning_rate": 4.153846153846154e-05, "loss": 0.6326, "num_tokens": 12055174.0, "step": 19 }, { "epoch": 0.0023649048125812936, "grad_norm": 0.5231024026870728, "learning_rate": 4.384615384615385e-05, "loss": 0.5653, "num_tokens": 12687424.0, "step": 20 }, { "epoch": 0.0024831500532103584, "grad_norm": 0.5611249804496765, "learning_rate": 4.615384615384616e-05, "loss": 0.606, "num_tokens": 13318953.0, "step": 21 }, { "epoch": 0.002601395293839423, "grad_norm": 0.5640860199928284, "learning_rate": 4.846153846153846e-05, "loss": 0.5724, "num_tokens": 13937259.0, "step": 22 }, { "epoch": 0.0027196405344684874, "grad_norm": 0.48454275727272034, "learning_rate": 5.076923076923077e-05, "loss": 0.5994, "num_tokens": 14569053.0, "step": 23 }, { "epoch": 0.002837885775097552, "grad_norm": 0.6201558113098145, "learning_rate": 5.3076923076923076e-05, "loss": 0.6035, "num_tokens": 15199901.0, "step": 24 }, { "epoch": 0.002956131015726617, "grad_norm": 0.773175060749054, "learning_rate": 5.538461538461539e-05, "loss": 0.5975, "num_tokens": 15830276.0, "step": 25 }, { "epoch": 0.0030743762563556817, "grad_norm": 0.5781369209289551, "learning_rate": 5.76923076923077e-05, "loss": 0.5864, "num_tokens": 16465223.0, "step": 26 }, { "epoch": 0.0031926214969847464, "grad_norm": 0.5451337695121765, "learning_rate": 6e-05, "loss": 0.5692, "num_tokens": 17102309.0, "step": 27 }, { "epoch": 0.003310866737613811, "grad_norm": 0.48841163516044617, "learning_rate": 5.9999801360699206e-05, "loss": 0.5736, "num_tokens": 17738787.0, "step": 28 }, { "epoch": 0.003429111978242876, "grad_norm": 0.43556851148605347, "learning_rate": 5.9999205445719606e-05, "loss": 0.5085, "num_tokens": 18370503.0, "step": 29 }, { "epoch": 0.0035473572188719402, "grad_norm": 0.4315873682498932, "learning_rate": 5.999821226382951e-05, "loss": 0.5342, "num_tokens": 19008700.0, "step": 30 }, { "epoch": 0.003665602459501005, "grad_norm": 0.48499879240989685, "learning_rate": 5.99968218296426e-05, "loss": 0.5414, "num_tokens": 19641076.0, "step": 31 }, { "epoch": 0.0037838477001300697, "grad_norm": 0.3707197308540344, "learning_rate": 5.999503416361778e-05, "loss": 0.4694, "num_tokens": 20268470.0, "step": 32 }, { "epoch": 0.0039020929407591345, "grad_norm": 0.4602040946483612, "learning_rate": 5.99928492920588e-05, "loss": 0.545, "num_tokens": 20903791.0, "step": 33 }, { "epoch": 0.004020338181388199, "grad_norm": 0.4377839267253876, "learning_rate": 5.999026724711391e-05, "loss": 0.5273, "num_tokens": 21537889.0, "step": 34 }, { "epoch": 0.0041385834220172635, "grad_norm": 0.3558352291584015, "learning_rate": 5.998728806677537e-05, "loss": 0.4575, "num_tokens": 22169163.0, "step": 35 }, { "epoch": 0.004256828662646328, "grad_norm": 0.4064357280731201, "learning_rate": 5.99839117948789e-05, "loss": 0.5139, "num_tokens": 22802993.0, "step": 36 }, { "epoch": 0.004375073903275393, "grad_norm": 0.40676349401474, "learning_rate": 5.998013848110306e-05, "loss": 0.4923, "num_tokens": 23436910.0, "step": 37 }, { "epoch": 0.004493319143904458, "grad_norm": 0.4407147765159607, "learning_rate": 5.997596818096846e-05, "loss": 0.5295, "num_tokens": 24066237.0, "step": 38 }, { "epoch": 0.0046115643845335225, "grad_norm": 0.42400693893432617, "learning_rate": 5.997140095583699e-05, "loss": 0.4883, "num_tokens": 24702070.0, "step": 39 }, { "epoch": 0.004729809625162587, "grad_norm": 0.36618033051490784, "learning_rate": 5.99664368729109e-05, "loss": 0.4745, "num_tokens": 25335554.0, "step": 40 }, { "epoch": 0.004848054865791652, "grad_norm": 0.32382911443710327, "learning_rate": 5.996107600523183e-05, "loss": 0.4362, "num_tokens": 25967347.0, "step": 41 }, { "epoch": 0.004966300106420717, "grad_norm": 0.39235326647758484, "learning_rate": 5.995531843167969e-05, "loss": 0.4558, "num_tokens": 26599914.0, "step": 42 }, { "epoch": 0.0050845453470497815, "grad_norm": 0.3560352921485901, "learning_rate": 5.9949164236971555e-05, "loss": 0.5103, "num_tokens": 27231412.0, "step": 43 }, { "epoch": 0.005202790587678846, "grad_norm": 0.4623904526233673, "learning_rate": 5.994261351166038e-05, "loss": 0.5168, "num_tokens": 27868949.0, "step": 44 }, { "epoch": 0.005321035828307911, "grad_norm": 0.3559505343437195, "learning_rate": 5.99356663521337e-05, "loss": 0.4952, "num_tokens": 28505673.0, "step": 45 }, { "epoch": 0.005439281068936975, "grad_norm": 0.4067099690437317, "learning_rate": 5.9928322860612126e-05, "loss": 0.5023, "num_tokens": 29143956.0, "step": 46 }, { "epoch": 0.00555752630956604, "grad_norm": 0.40066882967948914, "learning_rate": 5.992058314514801e-05, "loss": 0.5023, "num_tokens": 29780377.0, "step": 47 }, { "epoch": 0.005675771550195104, "grad_norm": 0.3772350251674652, "learning_rate": 5.9912447319623676e-05, "loss": 0.5043, "num_tokens": 30419988.0, "step": 48 }, { "epoch": 0.005794016790824169, "grad_norm": 0.36861610412597656, "learning_rate": 5.9903915503749835e-05, "loss": 0.4962, "num_tokens": 31058641.0, "step": 49 }, { "epoch": 0.005912262031453234, "grad_norm": 0.36637431383132935, "learning_rate": 5.989498782306382e-05, "loss": 0.4995, "num_tokens": 31696113.0, "step": 50 }, { "epoch": 0.006030507272082299, "grad_norm": 0.432847797870636, "learning_rate": 5.9885664408927744e-05, "loss": 0.5389, "num_tokens": 32335026.0, "step": 51 }, { "epoch": 0.006148752512711363, "grad_norm": 0.37941452860832214, "learning_rate": 5.98759453985265e-05, "loss": 0.4635, "num_tokens": 32968147.0, "step": 52 }, { "epoch": 0.006266997753340428, "grad_norm": 0.4316914677619934, "learning_rate": 5.9865830934865846e-05, "loss": 0.5001, "num_tokens": 33599687.0, "step": 53 }, { "epoch": 0.006385242993969493, "grad_norm": 0.36900049448013306, "learning_rate": 5.98553211667702e-05, "loss": 0.4727, "num_tokens": 34235952.0, "step": 54 }, { "epoch": 0.006503488234598558, "grad_norm": 0.39060965180397034, "learning_rate": 5.9844416248880556e-05, "loss": 0.4985, "num_tokens": 34855614.0, "step": 55 }, { "epoch": 0.006621733475227622, "grad_norm": 0.4554467499256134, "learning_rate": 5.983311634165209e-05, "loss": 0.5408, "num_tokens": 35490773.0, "step": 56 }, { "epoch": 0.006739978715856687, "grad_norm": 0.3941882252693176, "learning_rate": 5.982142161135191e-05, "loss": 0.5216, "num_tokens": 36118336.0, "step": 57 }, { "epoch": 0.006858223956485752, "grad_norm": 0.39384809136390686, "learning_rate": 5.9809332230056545e-05, "loss": 0.4929, "num_tokens": 36751911.0, "step": 58 }, { "epoch": 0.0069764691971148166, "grad_norm": 0.38750314712524414, "learning_rate": 5.979684837564939e-05, "loss": 0.4889, "num_tokens": 37375413.0, "step": 59 }, { "epoch": 0.0070947144377438804, "grad_norm": 0.387765496969223, "learning_rate": 5.978397023181817e-05, "loss": 0.4611, "num_tokens": 38006888.0, "step": 60 }, { "epoch": 0.007212959678372945, "grad_norm": 0.36486542224884033, "learning_rate": 5.977069798805219e-05, "loss": 0.4789, "num_tokens": 38640497.0, "step": 61 }, { "epoch": 0.00733120491900201, "grad_norm": 0.32066309452056885, "learning_rate": 5.975703183963953e-05, "loss": 0.4666, "num_tokens": 39273313.0, "step": 62 }, { "epoch": 0.007449450159631075, "grad_norm": 0.4485960304737091, "learning_rate": 5.97429719876642e-05, "loss": 0.5988, "num_tokens": 39910620.0, "step": 63 }, { "epoch": 0.007567695400260139, "grad_norm": 0.45505833625793457, "learning_rate": 5.97285186390032e-05, "loss": 0.4779, "num_tokens": 40546868.0, "step": 64 }, { "epoch": 0.007685940640889204, "grad_norm": 0.3256620168685913, "learning_rate": 5.9713672006323386e-05, "loss": 0.4478, "num_tokens": 41182518.0, "step": 65 }, { "epoch": 0.007804185881518269, "grad_norm": 0.4429851770401001, "learning_rate": 5.969843230807847e-05, "loss": 0.4945, "num_tokens": 41817083.0, "step": 66 }, { "epoch": 0.007922431122147333, "grad_norm": 0.4284612536430359, "learning_rate": 5.96827997685057e-05, "loss": 0.4906, "num_tokens": 42454047.0, "step": 67 }, { "epoch": 0.008040676362776398, "grad_norm": 0.41103261709213257, "learning_rate": 5.966677461762262e-05, "loss": 0.4699, "num_tokens": 43066339.0, "step": 68 }, { "epoch": 0.008158921603405462, "grad_norm": 0.41535407304763794, "learning_rate": 5.965035709122364e-05, "loss": 0.516, "num_tokens": 43703254.0, "step": 69 }, { "epoch": 0.008277166844034527, "grad_norm": 0.41889598965644836, "learning_rate": 5.963354743087664e-05, "loss": 0.5109, "num_tokens": 44339105.0, "step": 70 }, { "epoch": 0.008395412084663592, "grad_norm": 0.37383216619491577, "learning_rate": 5.9616345883919304e-05, "loss": 0.4497, "num_tokens": 44969251.0, "step": 71 }, { "epoch": 0.008513657325292657, "grad_norm": 0.39339086413383484, "learning_rate": 5.9598752703455596e-05, "loss": 0.4967, "num_tokens": 45605957.0, "step": 72 }, { "epoch": 0.008631902565921721, "grad_norm": 0.34155574440956116, "learning_rate": 5.958076814835196e-05, "loss": 0.4478, "num_tokens": 46242216.0, "step": 73 }, { "epoch": 0.008750147806550786, "grad_norm": 0.40994498133659363, "learning_rate": 5.956239248323354e-05, "loss": 0.4974, "num_tokens": 46879736.0, "step": 74 }, { "epoch": 0.00886839304717985, "grad_norm": 0.34924760460853577, "learning_rate": 5.9543625978480276e-05, "loss": 0.4551, "num_tokens": 47508876.0, "step": 75 }, { "epoch": 0.008986638287808916, "grad_norm": 0.3592020869255066, "learning_rate": 5.952446891022294e-05, "loss": 0.4589, "num_tokens": 48148110.0, "step": 76 }, { "epoch": 0.00910488352843798, "grad_norm": 0.3335554003715515, "learning_rate": 5.9504921560339085e-05, "loss": 0.4415, "num_tokens": 48779111.0, "step": 77 }, { "epoch": 0.009223128769067045, "grad_norm": 0.33642348647117615, "learning_rate": 5.948498421644883e-05, "loss": 0.4479, "num_tokens": 49414520.0, "step": 78 }, { "epoch": 0.00934137400969611, "grad_norm": 0.3461398184299469, "learning_rate": 5.9464657171910686e-05, "loss": 0.4697, "num_tokens": 50047364.0, "step": 79 }, { "epoch": 0.009459619250325174, "grad_norm": 0.35207217931747437, "learning_rate": 5.944394072581726e-05, "loss": 0.4365, "num_tokens": 50679909.0, "step": 80 }, { "epoch": 0.00957786449095424, "grad_norm": 0.3526061177253723, "learning_rate": 5.9422835182990794e-05, "loss": 0.447, "num_tokens": 51313449.0, "step": 81 }, { "epoch": 0.009696109731583304, "grad_norm": 0.3391474783420563, "learning_rate": 5.940134085397872e-05, "loss": 0.4642, "num_tokens": 51949695.0, "step": 82 }, { "epoch": 0.009814354972212369, "grad_norm": 0.3502749800682068, "learning_rate": 5.937945805504906e-05, "loss": 0.4723, "num_tokens": 52582348.0, "step": 83 }, { "epoch": 0.009932600212841433, "grad_norm": 0.35535070300102234, "learning_rate": 5.9357187108185826e-05, "loss": 0.4752, "num_tokens": 53211571.0, "step": 84 }, { "epoch": 0.010050845453470498, "grad_norm": 0.3651171922683716, "learning_rate": 5.933452834108421e-05, "loss": 0.4694, "num_tokens": 53846179.0, "step": 85 }, { "epoch": 0.010169090694099563, "grad_norm": 0.32200363278388977, "learning_rate": 5.931148208714582e-05, "loss": 0.4597, "num_tokens": 54478244.0, "step": 86 }, { "epoch": 0.010287335934728628, "grad_norm": 0.35413858294487, "learning_rate": 5.9288048685473756e-05, "loss": 0.4795, "num_tokens": 55113336.0, "step": 87 }, { "epoch": 0.010405581175357692, "grad_norm": 0.28715524077415466, "learning_rate": 5.92642284808676e-05, "loss": 0.4432, "num_tokens": 55750901.0, "step": 88 }, { "epoch": 0.010523826415986757, "grad_norm": 0.3725243806838989, "learning_rate": 5.924002182381839e-05, "loss": 0.5214, "num_tokens": 56387320.0, "step": 89 }, { "epoch": 0.010642071656615822, "grad_norm": 0.3085726499557495, "learning_rate": 5.9215429070503406e-05, "loss": 0.4465, "num_tokens": 57023503.0, "step": 90 }, { "epoch": 0.010760316897244887, "grad_norm": 0.3731476366519928, "learning_rate": 5.9190450582780974e-05, "loss": 0.5066, "num_tokens": 57651196.0, "step": 91 }, { "epoch": 0.01087856213787395, "grad_norm": 0.32896849513053894, "learning_rate": 5.9165086728185106e-05, "loss": 0.4651, "num_tokens": 58290170.0, "step": 92 }, { "epoch": 0.010996807378503014, "grad_norm": 0.29874181747436523, "learning_rate": 5.913933787992013e-05, "loss": 0.4323, "num_tokens": 58929585.0, "step": 93 }, { "epoch": 0.01111505261913208, "grad_norm": 0.3025204539299011, "learning_rate": 5.9113204416855196e-05, "loss": 0.4362, "num_tokens": 59569034.0, "step": 94 }, { "epoch": 0.011233297859761144, "grad_norm": 0.3040831685066223, "learning_rate": 5.908668672351862e-05, "loss": 0.4681, "num_tokens": 60197509.0, "step": 95 }, { "epoch": 0.011351543100390209, "grad_norm": 0.33227190375328064, "learning_rate": 5.9059785190092366e-05, "loss": 0.4445, "num_tokens": 60830564.0, "step": 96 }, { "epoch": 0.011469788341019273, "grad_norm": 0.3173273503780365, "learning_rate": 5.9032500212406184e-05, "loss": 0.4706, "num_tokens": 61466570.0, "step": 97 }, { "epoch": 0.011588033581648338, "grad_norm": 0.3499050438404083, "learning_rate": 5.900483219193184e-05, "loss": 0.4474, "num_tokens": 62100797.0, "step": 98 }, { "epoch": 0.011706278822277403, "grad_norm": 0.29081398248672485, "learning_rate": 5.8976781535777215e-05, "loss": 0.4548, "num_tokens": 62734274.0, "step": 99 }, { "epoch": 0.011824524062906468, "grad_norm": 0.3550204634666443, "learning_rate": 5.894834865668028e-05, "loss": 0.4781, "num_tokens": 63350637.0, "step": 100 }, { "epoch": 0.011942769303535532, "grad_norm": 0.3122808635234833, "learning_rate": 5.891953397300305e-05, "loss": 0.4562, "num_tokens": 63989248.0, "step": 101 }, { "epoch": 0.012061014544164597, "grad_norm": 0.3456708490848541, "learning_rate": 5.889033790872542e-05, "loss": 0.4657, "num_tokens": 64623402.0, "step": 102 }, { "epoch": 0.012179259784793662, "grad_norm": 0.30247852206230164, "learning_rate": 5.886076089343895e-05, "loss": 0.42, "num_tokens": 65263084.0, "step": 103 }, { "epoch": 0.012297505025422727, "grad_norm": 0.34775105118751526, "learning_rate": 5.883080336234049e-05, "loss": 0.4833, "num_tokens": 65895544.0, "step": 104 }, { "epoch": 0.012415750266051791, "grad_norm": 0.35499584674835205, "learning_rate": 5.88004657562258e-05, "loss": 0.4397, "num_tokens": 66526292.0, "step": 105 }, { "epoch": 0.012533995506680856, "grad_norm": 0.29378530383110046, "learning_rate": 5.876974852148312e-05, "loss": 0.455, "num_tokens": 67163008.0, "step": 106 }, { "epoch": 0.012652240747309921, "grad_norm": 0.32384178042411804, "learning_rate": 5.873865211008652e-05, "loss": 0.45, "num_tokens": 67799173.0, "step": 107 }, { "epoch": 0.012770485987938986, "grad_norm": 0.3031487762928009, "learning_rate": 5.870717697958928e-05, "loss": 0.431, "num_tokens": 68433626.0, "step": 108 }, { "epoch": 0.01288873122856805, "grad_norm": 0.3422238230705261, "learning_rate": 5.867532359311718e-05, "loss": 0.462, "num_tokens": 69071597.0, "step": 109 }, { "epoch": 0.013006976469197115, "grad_norm": 0.36208781599998474, "learning_rate": 5.864309241936167e-05, "loss": 0.4841, "num_tokens": 69708272.0, "step": 110 }, { "epoch": 0.01312522170982618, "grad_norm": 0.35731053352355957, "learning_rate": 5.861048393257293e-05, "loss": 0.4707, "num_tokens": 70309426.0, "step": 111 }, { "epoch": 0.013243466950455245, "grad_norm": 0.42830735445022583, "learning_rate": 5.8577498612552985e-05, "loss": 0.4905, "num_tokens": 70946347.0, "step": 112 }, { "epoch": 0.01336171219108431, "grad_norm": 0.33078286051750183, "learning_rate": 5.8544136944648554e-05, "loss": 0.4294, "num_tokens": 71578069.0, "step": 113 }, { "epoch": 0.013479957431713374, "grad_norm": 0.31700757145881653, "learning_rate": 5.851039941974397e-05, "loss": 0.4321, "num_tokens": 72216733.0, "step": 114 }, { "epoch": 0.013598202672342439, "grad_norm": 0.3752131462097168, "learning_rate": 5.8476286534253925e-05, "loss": 0.4585, "num_tokens": 72844928.0, "step": 115 }, { "epoch": 0.013716447912971504, "grad_norm": 0.29715201258659363, "learning_rate": 5.844179879011618e-05, "loss": 0.4574, "num_tokens": 73482837.0, "step": 116 }, { "epoch": 0.013834693153600568, "grad_norm": 0.30245885252952576, "learning_rate": 5.8406936694784165e-05, "loss": 0.4828, "num_tokens": 74118196.0, "step": 117 }, { "epoch": 0.013952938394229633, "grad_norm": 0.29638686776161194, "learning_rate": 5.8371700761219527e-05, "loss": 0.4263, "num_tokens": 74756174.0, "step": 118 }, { "epoch": 0.014071183634858696, "grad_norm": 0.3421514928340912, "learning_rate": 5.833609150788458e-05, "loss": 0.4882, "num_tokens": 75393367.0, "step": 119 }, { "epoch": 0.014189428875487761, "grad_norm": 0.3114563226699829, "learning_rate": 5.830010945873467e-05, "loss": 0.4346, "num_tokens": 76025875.0, "step": 120 }, { "epoch": 0.014307674116116826, "grad_norm": 0.29460081458091736, "learning_rate": 5.826375514321047e-05, "loss": 0.4155, "num_tokens": 76657710.0, "step": 121 }, { "epoch": 0.01442591935674589, "grad_norm": 0.34313178062438965, "learning_rate": 5.8227029096230196e-05, "loss": 0.4563, "num_tokens": 77289318.0, "step": 122 }, { "epoch": 0.014544164597374955, "grad_norm": 0.3677009046077728, "learning_rate": 5.81899318581817e-05, "loss": 0.4417, "num_tokens": 77923482.0, "step": 123 }, { "epoch": 0.01466240983800402, "grad_norm": 0.3275640606880188, "learning_rate": 5.8152463974914595e-05, "loss": 0.4607, "num_tokens": 78551959.0, "step": 124 }, { "epoch": 0.014780655078633085, "grad_norm": 0.37022823095321655, "learning_rate": 5.811462599773214e-05, "loss": 0.4506, "num_tokens": 79181459.0, "step": 125 }, { "epoch": 0.01489890031926215, "grad_norm": 0.31386008858680725, "learning_rate": 5.807641848338316e-05, "loss": 0.4194, "num_tokens": 79816398.0, "step": 126 }, { "epoch": 0.015017145559891214, "grad_norm": 0.3229714632034302, "learning_rate": 5.80378419940539e-05, "loss": 0.4543, "num_tokens": 80451198.0, "step": 127 }, { "epoch": 0.015135390800520279, "grad_norm": 0.33021923899650574, "learning_rate": 5.799889709735966e-05, "loss": 0.4601, "num_tokens": 81087693.0, "step": 128 }, { "epoch": 0.015253636041149344, "grad_norm": 0.2887071371078491, "learning_rate": 5.7959584366336535e-05, "loss": 0.4132, "num_tokens": 81722590.0, "step": 129 }, { "epoch": 0.015371881281778408, "grad_norm": 0.36038845777511597, "learning_rate": 5.7919904379432913e-05, "loss": 0.5152, "num_tokens": 82358511.0, "step": 130 }, { "epoch": 0.015490126522407473, "grad_norm": 0.32132768630981445, "learning_rate": 5.787985772050101e-05, "loss": 0.4346, "num_tokens": 82997292.0, "step": 131 }, { "epoch": 0.015608371763036538, "grad_norm": 0.2985667884349823, "learning_rate": 5.783944497878826e-05, "loss": 0.4244, "num_tokens": 83636002.0, "step": 132 }, { "epoch": 0.0157266170036656, "grad_norm": 0.30603644251823425, "learning_rate": 5.7798666748928636e-05, "loss": 0.4487, "num_tokens": 84266256.0, "step": 133 }, { "epoch": 0.015844862244294666, "grad_norm": 0.3081704378128052, "learning_rate": 5.775752363093394e-05, "loss": 0.4649, "num_tokens": 84901903.0, "step": 134 }, { "epoch": 0.01596310748492373, "grad_norm": 0.29722145199775696, "learning_rate": 5.7716016230184895e-05, "loss": 0.4351, "num_tokens": 85532297.0, "step": 135 }, { "epoch": 0.016081352725552795, "grad_norm": 0.27900344133377075, "learning_rate": 5.767414515742235e-05, "loss": 0.3898, "num_tokens": 86159004.0, "step": 136 }, { "epoch": 0.01619959796618186, "grad_norm": 0.2939743995666504, "learning_rate": 5.7631911028738184e-05, "loss": 0.4395, "num_tokens": 86791668.0, "step": 137 }, { "epoch": 0.016317843206810925, "grad_norm": 0.3190593421459198, "learning_rate": 5.7589314465566326e-05, "loss": 0.4502, "num_tokens": 87415500.0, "step": 138 }, { "epoch": 0.01643608844743999, "grad_norm": 0.29683569073677063, "learning_rate": 5.7546356094673545e-05, "loss": 0.4181, "num_tokens": 88054250.0, "step": 139 }, { "epoch": 0.016554333688069054, "grad_norm": 0.295808345079422, "learning_rate": 5.750303654815026e-05, "loss": 0.4011, "num_tokens": 88683640.0, "step": 140 }, { "epoch": 0.01667257892869812, "grad_norm": 0.33235254883766174, "learning_rate": 5.745935646340125e-05, "loss": 0.4017, "num_tokens": 89322994.0, "step": 141 }, { "epoch": 0.016790824169327184, "grad_norm": 0.30324289202690125, "learning_rate": 5.7415316483136266e-05, "loss": 0.4486, "num_tokens": 89959870.0, "step": 142 }, { "epoch": 0.01690906940995625, "grad_norm": 0.3970086872577667, "learning_rate": 5.737091725536055e-05, "loss": 0.4515, "num_tokens": 90595155.0, "step": 143 }, { "epoch": 0.017027314650585313, "grad_norm": 0.27713295817375183, "learning_rate": 5.732615943336531e-05, "loss": 0.4523, "num_tokens": 91229434.0, "step": 144 }, { "epoch": 0.017145559891214378, "grad_norm": 0.31949537992477417, "learning_rate": 5.7281043675718176e-05, "loss": 0.423, "num_tokens": 91864729.0, "step": 145 }, { "epoch": 0.017263805131843443, "grad_norm": 0.2788122892379761, "learning_rate": 5.7235570646253385e-05, "loss": 0.4037, "num_tokens": 92497696.0, "step": 146 }, { "epoch": 0.017382050372472507, "grad_norm": 0.33565449714660645, "learning_rate": 5.71897410140621e-05, "loss": 0.4794, "num_tokens": 93136961.0, "step": 147 }, { "epoch": 0.017500295613101572, "grad_norm": 0.3093065619468689, "learning_rate": 5.7143555453482564e-05, "loss": 0.46, "num_tokens": 93763389.0, "step": 148 }, { "epoch": 0.017618540853730637, "grad_norm": 0.28062355518341064, "learning_rate": 5.709701464409014e-05, "loss": 0.4594, "num_tokens": 94396681.0, "step": 149 }, { "epoch": 0.0177367860943597, "grad_norm": 0.29357820749282837, "learning_rate": 5.705011927068734e-05, "loss": 0.4611, "num_tokens": 95024975.0, "step": 150 }, { "epoch": 0.017855031334988766, "grad_norm": 0.37621134519577026, "learning_rate": 5.700287002329374e-05, "loss": 0.4681, "num_tokens": 95647926.0, "step": 151 }, { "epoch": 0.01797327657561783, "grad_norm": 0.3109932541847229, "learning_rate": 5.6955267597135795e-05, "loss": 0.4347, "num_tokens": 96284873.0, "step": 152 }, { "epoch": 0.018091521816246896, "grad_norm": 0.33683377504348755, "learning_rate": 5.6907312692636665e-05, "loss": 0.4484, "num_tokens": 96921347.0, "step": 153 }, { "epoch": 0.01820976705687596, "grad_norm": 0.29445359110832214, "learning_rate": 5.6859006015405905e-05, "loss": 0.3997, "num_tokens": 97555490.0, "step": 154 }, { "epoch": 0.018328012297505025, "grad_norm": 0.32711490988731384, "learning_rate": 5.681034827622904e-05, "loss": 0.4153, "num_tokens": 98193055.0, "step": 155 }, { "epoch": 0.01844625753813409, "grad_norm": 0.29570791125297546, "learning_rate": 5.67613401910571e-05, "loss": 0.3944, "num_tokens": 98826897.0, "step": 156 }, { "epoch": 0.018564502778763155, "grad_norm": 0.3205905854701996, "learning_rate": 5.671198248099617e-05, "loss": 0.4673, "num_tokens": 99462013.0, "step": 157 }, { "epoch": 0.01868274801939222, "grad_norm": 0.29417866468429565, "learning_rate": 5.666227587229669e-05, "loss": 0.4771, "num_tokens": 100097628.0, "step": 158 }, { "epoch": 0.018800993260021284, "grad_norm": 0.2989625036716461, "learning_rate": 5.66122210963428e-05, "loss": 0.4152, "num_tokens": 100734556.0, "step": 159 }, { "epoch": 0.01891923850065035, "grad_norm": 0.3053020238876343, "learning_rate": 5.656181888964159e-05, "loss": 0.4606, "num_tokens": 101371427.0, "step": 160 }, { "epoch": 0.019037483741279414, "grad_norm": 0.2914108633995056, "learning_rate": 5.6511069993812255e-05, "loss": 0.4647, "num_tokens": 102008014.0, "step": 161 }, { "epoch": 0.01915572898190848, "grad_norm": 0.31419283151626587, "learning_rate": 5.645997515557518e-05, "loss": 0.4277, "num_tokens": 102647195.0, "step": 162 }, { "epoch": 0.019273974222537543, "grad_norm": 0.25925683975219727, "learning_rate": 5.640853512674095e-05, "loss": 0.4409, "num_tokens": 103272117.0, "step": 163 }, { "epoch": 0.019392219463166608, "grad_norm": 0.29054054617881775, "learning_rate": 5.63567506641993e-05, "loss": 0.4468, "num_tokens": 103911617.0, "step": 164 }, { "epoch": 0.019510464703795673, "grad_norm": 0.2996600270271301, "learning_rate": 5.630462252990796e-05, "loss": 0.4583, "num_tokens": 104546025.0, "step": 165 }, { "epoch": 0.019628709944424737, "grad_norm": 0.26758819818496704, "learning_rate": 5.6252151490881474e-05, "loss": 0.4193, "num_tokens": 105181492.0, "step": 166 }, { "epoch": 0.019746955185053802, "grad_norm": 0.28083500266075134, "learning_rate": 5.6199338319179856e-05, "loss": 0.4166, "num_tokens": 105818707.0, "step": 167 }, { "epoch": 0.019865200425682867, "grad_norm": 0.2543669641017914, "learning_rate": 5.614618379189731e-05, "loss": 0.3928, "num_tokens": 106447672.0, "step": 168 }, { "epoch": 0.01998344566631193, "grad_norm": 0.29574477672576904, "learning_rate": 5.609268869115072e-05, "loss": 0.4303, "num_tokens": 107079280.0, "step": 169 }, { "epoch": 0.020101690906940996, "grad_norm": 0.2757669985294342, "learning_rate": 5.6038853804068205e-05, "loss": 0.4325, "num_tokens": 107716692.0, "step": 170 }, { "epoch": 0.02021993614757006, "grad_norm": 0.3341258764266968, "learning_rate": 5.598467992277748e-05, "loss": 0.4302, "num_tokens": 108346190.0, "step": 171 }, { "epoch": 0.020338181388199126, "grad_norm": 0.2687680423259735, "learning_rate": 5.5930167844394255e-05, "loss": 0.4188, "num_tokens": 108972655.0, "step": 172 }, { "epoch": 0.02045642662882819, "grad_norm": 0.3229896128177643, "learning_rate": 5.587531837101046e-05, "loss": 0.4436, "num_tokens": 109606533.0, "step": 173 }, { "epoch": 0.020574671869457255, "grad_norm": 0.2820740044116974, "learning_rate": 5.582013230968246e-05, "loss": 0.4294, "num_tokens": 110242667.0, "step": 174 }, { "epoch": 0.02069291711008632, "grad_norm": 0.35922062397003174, "learning_rate": 5.5764610472419194e-05, "loss": 0.4835, "num_tokens": 110879342.0, "step": 175 }, { "epoch": 0.020811162350715385, "grad_norm": 0.2997070550918579, "learning_rate": 5.5708753676170236e-05, "loss": 0.4347, "num_tokens": 111515578.0, "step": 176 }, { "epoch": 0.02092940759134445, "grad_norm": 0.2995700240135193, "learning_rate": 5.565256274281369e-05, "loss": 0.395, "num_tokens": 112148074.0, "step": 177 }, { "epoch": 0.021047652831973514, "grad_norm": 0.319938600063324, "learning_rate": 5.5596038499144235e-05, "loss": 0.4813, "num_tokens": 112784825.0, "step": 178 }, { "epoch": 0.02116589807260258, "grad_norm": 0.338448166847229, "learning_rate": 5.5539181776860835e-05, "loss": 0.457, "num_tokens": 113415511.0, "step": 179 }, { "epoch": 0.021284143313231644, "grad_norm": 0.318758487701416, "learning_rate": 5.548199341255457e-05, "loss": 0.4566, "num_tokens": 114014233.0, "step": 180 }, { "epoch": 0.02140238855386071, "grad_norm": 0.322611927986145, "learning_rate": 5.542447424769632e-05, "loss": 0.4384, "num_tokens": 114646091.0, "step": 181 }, { "epoch": 0.021520633794489773, "grad_norm": 0.3043844699859619, "learning_rate": 5.536662512862434e-05, "loss": 0.4125, "num_tokens": 115248849.0, "step": 182 }, { "epoch": 0.021638879035118838, "grad_norm": 0.30535179376602173, "learning_rate": 5.530844690653187e-05, "loss": 0.4083, "num_tokens": 115882858.0, "step": 183 }, { "epoch": 0.0217571242757479, "grad_norm": 0.29622629284858704, "learning_rate": 5.524994043745455e-05, "loss": 0.4424, "num_tokens": 116516321.0, "step": 184 }, { "epoch": 0.021875369516376964, "grad_norm": 0.3178810775279999, "learning_rate": 5.519110658225789e-05, "loss": 0.4187, "num_tokens": 117149980.0, "step": 185 }, { "epoch": 0.02199361475700603, "grad_norm": 0.2797812819480896, "learning_rate": 5.513194620662453e-05, "loss": 0.4033, "num_tokens": 117787055.0, "step": 186 }, { "epoch": 0.022111859997635094, "grad_norm": 0.34582096338272095, "learning_rate": 5.5072460181041565e-05, "loss": 0.4264, "num_tokens": 118414231.0, "step": 187 }, { "epoch": 0.02223010523826416, "grad_norm": 0.3072027266025543, "learning_rate": 5.5012649380787697e-05, "loss": 0.4425, "num_tokens": 119042723.0, "step": 188 }, { "epoch": 0.022348350478893223, "grad_norm": 0.27631890773773193, "learning_rate": 5.495251468592038e-05, "loss": 0.47, "num_tokens": 119680244.0, "step": 189 }, { "epoch": 0.022466595719522288, "grad_norm": 0.39626777172088623, "learning_rate": 5.489205698126284e-05, "loss": 0.4255, "num_tokens": 120319137.0, "step": 190 }, { "epoch": 0.022584840960151353, "grad_norm": 0.28610390424728394, "learning_rate": 5.483127715639111e-05, "loss": 0.4364, "num_tokens": 120954282.0, "step": 191 }, { "epoch": 0.022703086200780417, "grad_norm": 0.4135710597038269, "learning_rate": 5.477017610562086e-05, "loss": 0.4342, "num_tokens": 121589180.0, "step": 192 }, { "epoch": 0.022821331441409482, "grad_norm": 0.3279666304588318, "learning_rate": 5.4708754727994347e-05, "loss": 0.4693, "num_tokens": 122226045.0, "step": 193 }, { "epoch": 0.022939576682038547, "grad_norm": 0.3193162679672241, "learning_rate": 5.4647013927267055e-05, "loss": 0.411, "num_tokens": 122863565.0, "step": 194 }, { "epoch": 0.02305782192266761, "grad_norm": 0.3436163067817688, "learning_rate": 5.4584954611894535e-05, "loss": 0.4065, "num_tokens": 123498631.0, "step": 195 }, { "epoch": 0.023176067163296676, "grad_norm": 0.3175835907459259, "learning_rate": 5.452257769501891e-05, "loss": 0.4343, "num_tokens": 124134670.0, "step": 196 }, { "epoch": 0.02329431240392574, "grad_norm": 0.276996374130249, "learning_rate": 5.445988409445553e-05, "loss": 0.4125, "num_tokens": 124770499.0, "step": 197 }, { "epoch": 0.023412557644554806, "grad_norm": 0.3553844690322876, "learning_rate": 5.4396874732679444e-05, "loss": 0.4659, "num_tokens": 125409234.0, "step": 198 }, { "epoch": 0.02353080288518387, "grad_norm": 0.26136353611946106, "learning_rate": 5.433355053681179e-05, "loss": 0.4354, "num_tokens": 126041885.0, "step": 199 }, { "epoch": 0.023649048125812935, "grad_norm": 0.3091793656349182, "learning_rate": 5.42699124386062e-05, "loss": 0.4539, "num_tokens": 126679673.0, "step": 200 }, { "epoch": 0.023767293366442, "grad_norm": 0.3038508892059326, "learning_rate": 5.420596137443508e-05, "loss": 0.4468, "num_tokens": 127318553.0, "step": 201 }, { "epoch": 0.023885538607071065, "grad_norm": 0.257994145154953, "learning_rate": 5.41416982852758e-05, "loss": 0.4177, "num_tokens": 127957565.0, "step": 202 }, { "epoch": 0.02400378384770013, "grad_norm": 0.3154793381690979, "learning_rate": 5.4077124116696884e-05, "loss": 0.4944, "num_tokens": 128588826.0, "step": 203 }, { "epoch": 0.024122029088329194, "grad_norm": 0.30118247866630554, "learning_rate": 5.401223981884411e-05, "loss": 0.4431, "num_tokens": 129222173.0, "step": 204 }, { "epoch": 0.02424027432895826, "grad_norm": 0.26696497201919556, "learning_rate": 5.3947046346426456e-05, "loss": 0.4586, "num_tokens": 129857385.0, "step": 205 }, { "epoch": 0.024358519569587324, "grad_norm": 0.25432252883911133, "learning_rate": 5.3881544658702133e-05, "loss": 0.3814, "num_tokens": 130486516.0, "step": 206 }, { "epoch": 0.02447676481021639, "grad_norm": 0.27828487753868103, "learning_rate": 5.381573571946445e-05, "loss": 0.4529, "num_tokens": 131117306.0, "step": 207 }, { "epoch": 0.024595010050845453, "grad_norm": 0.29483503103256226, "learning_rate": 5.374962049702759e-05, "loss": 0.4738, "num_tokens": 131749433.0, "step": 208 }, { "epoch": 0.024713255291474518, "grad_norm": 0.2637292742729187, "learning_rate": 5.3683199964212405e-05, "loss": 0.4242, "num_tokens": 132382579.0, "step": 209 }, { "epoch": 0.024831500532103583, "grad_norm": 0.2828076183795929, "learning_rate": 5.3616475098332105e-05, "loss": 0.4374, "num_tokens": 133017061.0, "step": 210 }, { "epoch": 0.024949745772732648, "grad_norm": 0.27759385108947754, "learning_rate": 5.3549446881177853e-05, "loss": 0.4296, "num_tokens": 133645920.0, "step": 211 }, { "epoch": 0.025067991013361712, "grad_norm": 0.26630890369415283, "learning_rate": 5.3482116299004336e-05, "loss": 0.468, "num_tokens": 134277976.0, "step": 212 }, { "epoch": 0.025186236253990777, "grad_norm": 0.24754807353019714, "learning_rate": 5.341448434251522e-05, "loss": 0.4468, "num_tokens": 134913386.0, "step": 213 }, { "epoch": 0.025304481494619842, "grad_norm": 0.27732178568840027, "learning_rate": 5.334655200684864e-05, "loss": 0.4323, "num_tokens": 135544399.0, "step": 214 }, { "epoch": 0.025422726735248907, "grad_norm": 0.30716535449028015, "learning_rate": 5.327832029156247e-05, "loss": 0.441, "num_tokens": 136182707.0, "step": 215 }, { "epoch": 0.02554097197587797, "grad_norm": 0.26287323236465454, "learning_rate": 5.3209790200619726e-05, "loss": 0.436, "num_tokens": 136819793.0, "step": 216 }, { "epoch": 0.025659217216507036, "grad_norm": 0.28410691022872925, "learning_rate": 5.314096274237367e-05, "loss": 0.4414, "num_tokens": 137459203.0, "step": 217 }, { "epoch": 0.0257774624571361, "grad_norm": 0.27251100540161133, "learning_rate": 5.3071838929553065e-05, "loss": 0.4345, "num_tokens": 138086108.0, "step": 218 }, { "epoch": 0.025895707697765166, "grad_norm": 0.24234391748905182, "learning_rate": 5.300241977924722e-05, "loss": 0.4244, "num_tokens": 138717361.0, "step": 219 }, { "epoch": 0.02601395293839423, "grad_norm": 0.31852856278419495, "learning_rate": 5.293270631289107e-05, "loss": 0.408, "num_tokens": 139353768.0, "step": 220 }, { "epoch": 0.026132198179023295, "grad_norm": 0.29865893721580505, "learning_rate": 5.286269955625011e-05, "loss": 0.4701, "num_tokens": 139986012.0, "step": 221 }, { "epoch": 0.02625044341965236, "grad_norm": 0.27321770787239075, "learning_rate": 5.279240053940531e-05, "loss": 0.4059, "num_tokens": 140618557.0, "step": 222 }, { "epoch": 0.026368688660281425, "grad_norm": 0.29831984639167786, "learning_rate": 5.2721810296737984e-05, "loss": 0.3978, "num_tokens": 141253328.0, "step": 223 }, { "epoch": 0.02648693390091049, "grad_norm": 0.3366415798664093, "learning_rate": 5.265092986691453e-05, "loss": 0.4354, "num_tokens": 141885327.0, "step": 224 }, { "epoch": 0.026605179141539554, "grad_norm": 0.27974531054496765, "learning_rate": 5.257976029287117e-05, "loss": 0.4497, "num_tokens": 142518760.0, "step": 225 }, { "epoch": 0.02672342438216862, "grad_norm": 0.31790000200271606, "learning_rate": 5.250830262179859e-05, "loss": 0.4561, "num_tokens": 143154500.0, "step": 226 }, { "epoch": 0.026841669622797684, "grad_norm": 0.2560494840145111, "learning_rate": 5.243655790512659e-05, "loss": 0.4402, "num_tokens": 143792062.0, "step": 227 }, { "epoch": 0.026959914863426748, "grad_norm": 0.27620622515678406, "learning_rate": 5.236452719850849e-05, "loss": 0.3912, "num_tokens": 144424810.0, "step": 228 }, { "epoch": 0.027078160104055813, "grad_norm": 0.2581166625022888, "learning_rate": 5.2292211561805726e-05, "loss": 0.4145, "num_tokens": 145021445.0, "step": 229 }, { "epoch": 0.027196405344684878, "grad_norm": 0.297852486371994, "learning_rate": 5.2219612059072196e-05, "loss": 0.4675, "num_tokens": 145656556.0, "step": 230 }, { "epoch": 0.027314650585313942, "grad_norm": 0.286258339881897, "learning_rate": 5.214672975853859e-05, "loss": 0.413, "num_tokens": 146293020.0, "step": 231 }, { "epoch": 0.027432895825943007, "grad_norm": 0.2543971538543701, "learning_rate": 5.207356573259671e-05, "loss": 0.4335, "num_tokens": 146922200.0, "step": 232 }, { "epoch": 0.027551141066572072, "grad_norm": 0.29354169964790344, "learning_rate": 5.2000121057783674e-05, "loss": 0.4786, "num_tokens": 147560483.0, "step": 233 }, { "epoch": 0.027669386307201137, "grad_norm": 0.22866986691951752, "learning_rate": 5.1926396814766034e-05, "loss": 0.4198, "num_tokens": 148198475.0, "step": 234 }, { "epoch": 0.0277876315478302, "grad_norm": 0.2605131268501282, "learning_rate": 5.185239408832397e-05, "loss": 0.4363, "num_tokens": 148811827.0, "step": 235 }, { "epoch": 0.027905876788459266, "grad_norm": 0.22731252014636993, "learning_rate": 5.177811396733523e-05, "loss": 0.4034, "num_tokens": 149446588.0, "step": 236 }, { "epoch": 0.028024122029088328, "grad_norm": 0.23291230201721191, "learning_rate": 5.170355754475919e-05, "loss": 0.3862, "num_tokens": 150080880.0, "step": 237 }, { "epoch": 0.028142367269717392, "grad_norm": 0.2324601113796234, "learning_rate": 5.162872591762069e-05, "loss": 0.4557, "num_tokens": 150720517.0, "step": 238 }, { "epoch": 0.028260612510346457, "grad_norm": 0.2646247148513794, "learning_rate": 5.155362018699396e-05, "loss": 0.4241, "num_tokens": 151354865.0, "step": 239 }, { "epoch": 0.028378857750975522, "grad_norm": 0.2472905069589615, "learning_rate": 5.147824145798643e-05, "loss": 0.3896, "num_tokens": 151989302.0, "step": 240 }, { "epoch": 0.028497102991604586, "grad_norm": 0.27559757232666016, "learning_rate": 5.1402590839722356e-05, "loss": 0.4254, "num_tokens": 152622644.0, "step": 241 }, { "epoch": 0.02861534823223365, "grad_norm": 0.2577532231807709, "learning_rate": 5.132666944532664e-05, "loss": 0.4598, "num_tokens": 153254978.0, "step": 242 }, { "epoch": 0.028733593472862716, "grad_norm": 0.26456958055496216, "learning_rate": 5.125047839190837e-05, "loss": 0.4006, "num_tokens": 153888439.0, "step": 243 }, { "epoch": 0.02885183871349178, "grad_norm": 0.23455888032913208, "learning_rate": 5.1174018800544395e-05, "loss": 0.377, "num_tokens": 154521234.0, "step": 244 }, { "epoch": 0.028970083954120845, "grad_norm": 0.2502966821193695, "learning_rate": 5.1097291796262854e-05, "loss": 0.4257, "num_tokens": 155156546.0, "step": 245 }, { "epoch": 0.02908832919474991, "grad_norm": 0.26023155450820923, "learning_rate": 5.10202985080266e-05, "loss": 0.4268, "num_tokens": 155791974.0, "step": 246 }, { "epoch": 0.029206574435378975, "grad_norm": 0.23456987738609314, "learning_rate": 5.0943040068716584e-05, "loss": 0.385, "num_tokens": 156417737.0, "step": 247 }, { "epoch": 0.02932481967600804, "grad_norm": 0.23901493847370148, "learning_rate": 5.086551761511521e-05, "loss": 0.4553, "num_tokens": 157054324.0, "step": 248 }, { "epoch": 0.029443064916637104, "grad_norm": 0.26856529712677, "learning_rate": 5.0787732287889574e-05, "loss": 0.4435, "num_tokens": 157686875.0, "step": 249 }, { "epoch": 0.02956131015726617, "grad_norm": 0.28496497869491577, "learning_rate": 5.070968523157474e-05, "loss": 0.4373, "num_tokens": 158318798.0, "step": 250 }, { "epoch": 0.029679555397895234, "grad_norm": 0.2572629451751709, "learning_rate": 5.0631377594556795e-05, "loss": 0.451, "num_tokens": 158956587.0, "step": 251 }, { "epoch": 0.0297978006385243, "grad_norm": 0.25811442732810974, "learning_rate": 5.05528105290561e-05, "loss": 0.3716, "num_tokens": 159591859.0, "step": 252 }, { "epoch": 0.029916045879153363, "grad_norm": 0.266215980052948, "learning_rate": 5.047398519111017e-05, "loss": 0.4106, "num_tokens": 160224798.0, "step": 253 }, { "epoch": 0.030034291119782428, "grad_norm": 0.2669126093387604, "learning_rate": 5.0394902740556806e-05, "loss": 0.4158, "num_tokens": 160855622.0, "step": 254 }, { "epoch": 0.030152536360411493, "grad_norm": 0.27752405405044556, "learning_rate": 5.031556434101694e-05, "loss": 0.3848, "num_tokens": 161489536.0, "step": 255 }, { "epoch": 0.030270781601040558, "grad_norm": 0.2833244502544403, "learning_rate": 5.023597115987755e-05, "loss": 0.4691, "num_tokens": 162123541.0, "step": 256 }, { "epoch": 0.030389026841669622, "grad_norm": 0.23394179344177246, "learning_rate": 5.0156124368274474e-05, "loss": 0.4263, "num_tokens": 162754194.0, "step": 257 }, { "epoch": 0.030507272082298687, "grad_norm": 0.2544839680194855, "learning_rate": 5.007602514107518e-05, "loss": 0.4125, "num_tokens": 163389806.0, "step": 258 }, { "epoch": 0.030625517322927752, "grad_norm": 0.23980510234832764, "learning_rate": 4.99956746568615e-05, "loss": 0.4008, "num_tokens": 164022489.0, "step": 259 }, { "epoch": 0.030743762563556817, "grad_norm": 0.2352251559495926, "learning_rate": 4.991507409791223e-05, "loss": 0.4105, "num_tokens": 164655264.0, "step": 260 }, { "epoch": 0.03086200780418588, "grad_norm": 0.23630301654338837, "learning_rate": 4.983422465018581e-05, "loss": 0.4021, "num_tokens": 165293663.0, "step": 261 }, { "epoch": 0.030980253044814946, "grad_norm": 0.25700318813323975, "learning_rate": 4.975312750330279e-05, "loss": 0.4257, "num_tokens": 165930344.0, "step": 262 }, { "epoch": 0.03109849828544401, "grad_norm": 0.22052328288555145, "learning_rate": 4.967178385052841e-05, "loss": 0.3837, "num_tokens": 166565332.0, "step": 263 }, { "epoch": 0.031216743526073076, "grad_norm": 0.25492745637893677, "learning_rate": 4.959019488875499e-05, "loss": 0.4271, "num_tokens": 167200224.0, "step": 264 }, { "epoch": 0.03133498876670214, "grad_norm": 0.23811548948287964, "learning_rate": 4.9508361818484334e-05, "loss": 0.4191, "num_tokens": 167832144.0, "step": 265 }, { "epoch": 0.0314532340073312, "grad_norm": 0.22468101978302002, "learning_rate": 4.9426285843810045e-05, "loss": 0.3999, "num_tokens": 168469994.0, "step": 266 }, { "epoch": 0.03157147924796027, "grad_norm": 0.2614614963531494, "learning_rate": 4.934396817239986e-05, "loss": 0.4342, "num_tokens": 169106098.0, "step": 267 }, { "epoch": 0.03168972448858933, "grad_norm": 0.22566261887550354, "learning_rate": 4.926141001547783e-05, "loss": 0.3746, "num_tokens": 169737993.0, "step": 268 }, { "epoch": 0.0318079697292184, "grad_norm": 0.2282998412847519, "learning_rate": 4.91786125878065e-05, "loss": 0.3858, "num_tokens": 170368812.0, "step": 269 }, { "epoch": 0.03192621496984746, "grad_norm": 0.25831902027130127, "learning_rate": 4.9095577107669084e-05, "loss": 0.4595, "num_tokens": 171006424.0, "step": 270 }, { "epoch": 0.03204446021047653, "grad_norm": 0.22491995990276337, "learning_rate": 4.9012304796851486e-05, "loss": 0.4136, "num_tokens": 171645721.0, "step": 271 }, { "epoch": 0.03216270545110559, "grad_norm": 0.25414589047431946, "learning_rate": 4.892879688062432e-05, "loss": 0.4001, "num_tokens": 172281075.0, "step": 272 }, { "epoch": 0.03228095069173466, "grad_norm": 0.22207149863243103, "learning_rate": 4.884505458772495e-05, "loss": 0.3639, "num_tokens": 172914063.0, "step": 273 }, { "epoch": 0.03239919593236372, "grad_norm": 0.23464854061603546, "learning_rate": 4.876107915033933e-05, "loss": 0.4264, "num_tokens": 173548325.0, "step": 274 }, { "epoch": 0.03251744117299279, "grad_norm": 0.26920729875564575, "learning_rate": 4.867687180408392e-05, "loss": 0.4248, "num_tokens": 174183975.0, "step": 275 }, { "epoch": 0.03263568641362185, "grad_norm": 0.22815345227718353, "learning_rate": 4.859243378798748e-05, "loss": 0.398, "num_tokens": 174818549.0, "step": 276 }, { "epoch": 0.03275393165425092, "grad_norm": 0.232111856341362, "learning_rate": 4.850776634447287e-05, "loss": 0.3862, "num_tokens": 175451113.0, "step": 277 }, { "epoch": 0.03287217689487998, "grad_norm": 0.27156439423561096, "learning_rate": 4.842287071933874e-05, "loss": 0.433, "num_tokens": 176087116.0, "step": 278 }, { "epoch": 0.03299042213550905, "grad_norm": 0.2743763029575348, "learning_rate": 4.8337748161741207e-05, "loss": 0.4497, "num_tokens": 176724483.0, "step": 279 }, { "epoch": 0.03310866737613811, "grad_norm": 0.26658013463020325, "learning_rate": 4.825239992417548e-05, "loss": 0.4255, "num_tokens": 177361164.0, "step": 280 }, { "epoch": 0.033226912616767176, "grad_norm": 0.2353833168745041, "learning_rate": 4.8166827262457436e-05, "loss": 0.3786, "num_tokens": 177999098.0, "step": 281 }, { "epoch": 0.03334515785739624, "grad_norm": 0.26090359687805176, "learning_rate": 4.808103143570511e-05, "loss": 0.4224, "num_tokens": 178627820.0, "step": 282 }, { "epoch": 0.033463403098025306, "grad_norm": 0.23582051694393158, "learning_rate": 4.7995013706320215e-05, "loss": 0.4088, "num_tokens": 179259176.0, "step": 283 }, { "epoch": 0.03358164833865437, "grad_norm": 0.26351359486579895, "learning_rate": 4.790877533996955e-05, "loss": 0.4279, "num_tokens": 179890905.0, "step": 284 }, { "epoch": 0.033699893579283435, "grad_norm": 0.25399163365364075, "learning_rate": 4.7822317605566335e-05, "loss": 0.4169, "num_tokens": 180518445.0, "step": 285 }, { "epoch": 0.0338181388199125, "grad_norm": 0.2980181872844696, "learning_rate": 4.7735641775251624e-05, "loss": 0.449, "num_tokens": 181154667.0, "step": 286 }, { "epoch": 0.033936384060541565, "grad_norm": 0.27028796076774597, "learning_rate": 4.764874912437551e-05, "loss": 0.4321, "num_tokens": 181789184.0, "step": 287 }, { "epoch": 0.034054629301170626, "grad_norm": 0.2491423338651657, "learning_rate": 4.756164093147838e-05, "loss": 0.4155, "num_tokens": 182421462.0, "step": 288 }, { "epoch": 0.034172874541799694, "grad_norm": 0.3009137213230133, "learning_rate": 4.747431847827214e-05, "loss": 0.4015, "num_tokens": 183056216.0, "step": 289 }, { "epoch": 0.034291119782428756, "grad_norm": 0.2448507696390152, "learning_rate": 4.73867830496213e-05, "loss": 0.4331, "num_tokens": 183695586.0, "step": 290 }, { "epoch": 0.034409365023057824, "grad_norm": 0.2819685935974121, "learning_rate": 4.729903593352412e-05, "loss": 0.4017, "num_tokens": 184323170.0, "step": 291 }, { "epoch": 0.034527610263686885, "grad_norm": 0.3014317750930786, "learning_rate": 4.721107842109362e-05, "loss": 0.4771, "num_tokens": 184923402.0, "step": 292 }, { "epoch": 0.03464585550431595, "grad_norm": 0.23289276659488678, "learning_rate": 4.712291180653859e-05, "loss": 0.4004, "num_tokens": 185562179.0, "step": 293 }, { "epoch": 0.034764100744945015, "grad_norm": 0.25156062841415405, "learning_rate": 4.703453738714457e-05, "loss": 0.4127, "num_tokens": 186196488.0, "step": 294 }, { "epoch": 0.03488234598557408, "grad_norm": 0.27437835931777954, "learning_rate": 4.6945956463254733e-05, "loss": 0.4458, "num_tokens": 186795333.0, "step": 295 }, { "epoch": 0.035000591226203144, "grad_norm": 0.2655051052570343, "learning_rate": 4.6857170338250756e-05, "loss": 0.3878, "num_tokens": 187431540.0, "step": 296 }, { "epoch": 0.03511883646683221, "grad_norm": 0.24947364628314972, "learning_rate": 4.676818031853367e-05, "loss": 0.4086, "num_tokens": 188067882.0, "step": 297 }, { "epoch": 0.035237081707461274, "grad_norm": 0.27399611473083496, "learning_rate": 4.667898771350461e-05, "loss": 0.4469, "num_tokens": 188704706.0, "step": 298 }, { "epoch": 0.03535532694809034, "grad_norm": 0.23381806910037994, "learning_rate": 4.658959383554554e-05, "loss": 0.3872, "num_tokens": 189339944.0, "step": 299 }, { "epoch": 0.0354735721887194, "grad_norm": 0.30683842301368713, "learning_rate": 4.6500000000000005e-05, "loss": 0.4722, "num_tokens": 189977157.0, "step": 300 }, { "epoch": 0.03559181742934847, "grad_norm": 0.23590795695781708, "learning_rate": 4.641020752515366e-05, "loss": 0.4177, "num_tokens": 190586411.0, "step": 301 }, { "epoch": 0.03571006266997753, "grad_norm": 0.2523725926876068, "learning_rate": 4.632021773221499e-05, "loss": 0.4323, "num_tokens": 191219345.0, "step": 302 }, { "epoch": 0.0358283079106066, "grad_norm": 0.24050471186637878, "learning_rate": 4.623003194529583e-05, "loss": 0.4244, "num_tokens": 191855183.0, "step": 303 }, { "epoch": 0.03594655315123566, "grad_norm": 0.24300076067447662, "learning_rate": 4.613965149139185e-05, "loss": 0.3956, "num_tokens": 192485493.0, "step": 304 }, { "epoch": 0.03606479839186473, "grad_norm": 0.2315610945224762, "learning_rate": 4.6049077700363056e-05, "loss": 0.3896, "num_tokens": 193122797.0, "step": 305 }, { "epoch": 0.03618304363249379, "grad_norm": 0.25560230016708374, "learning_rate": 4.595831190491424e-05, "loss": 0.4167, "num_tokens": 193759752.0, "step": 306 }, { "epoch": 0.03630128887312286, "grad_norm": 0.25288936495780945, "learning_rate": 4.586735544057531e-05, "loss": 0.4087, "num_tokens": 194394288.0, "step": 307 }, { "epoch": 0.03641953411375192, "grad_norm": 0.2969334125518799, "learning_rate": 4.5776209645681745e-05, "loss": 0.4075, "num_tokens": 195027778.0, "step": 308 }, { "epoch": 0.03653777935438099, "grad_norm": 0.22655892372131348, "learning_rate": 4.568487586135478e-05, "loss": 0.3378, "num_tokens": 195660013.0, "step": 309 }, { "epoch": 0.03665602459501005, "grad_norm": 0.28944021463394165, "learning_rate": 4.5593355431481754e-05, "loss": 0.4249, "num_tokens": 196290478.0, "step": 310 }, { "epoch": 0.03677426983563912, "grad_norm": 0.23864449560642242, "learning_rate": 4.550164970269633e-05, "loss": 0.4412, "num_tokens": 196927060.0, "step": 311 }, { "epoch": 0.03689251507626818, "grad_norm": 0.247343510389328, "learning_rate": 4.540976002435862e-05, "loss": 0.4384, "num_tokens": 197557085.0, "step": 312 }, { "epoch": 0.03701076031689725, "grad_norm": 0.2885189950466156, "learning_rate": 4.53176877485354e-05, "loss": 0.4252, "num_tokens": 198189535.0, "step": 313 }, { "epoch": 0.03712900555752631, "grad_norm": 0.2791072428226471, "learning_rate": 4.5225434229980215e-05, "loss": 0.4425, "num_tokens": 198820737.0, "step": 314 }, { "epoch": 0.03724725079815538, "grad_norm": 0.2613127529621124, "learning_rate": 4.513300082611336e-05, "loss": 0.3994, "num_tokens": 199451792.0, "step": 315 }, { "epoch": 0.03736549603878444, "grad_norm": 0.2581581473350525, "learning_rate": 4.504038889700201e-05, "loss": 0.4052, "num_tokens": 200086012.0, "step": 316 }, { "epoch": 0.0374837412794135, "grad_norm": 0.25737565755844116, "learning_rate": 4.494759980534017e-05, "loss": 0.3975, "num_tokens": 200723155.0, "step": 317 }, { "epoch": 0.03760198652004257, "grad_norm": 0.2575814127922058, "learning_rate": 4.4854634916428583e-05, "loss": 0.4188, "num_tokens": 201362056.0, "step": 318 }, { "epoch": 0.03772023176067163, "grad_norm": 0.24522624909877777, "learning_rate": 4.4761495598154706e-05, "loss": 0.4012, "num_tokens": 201996006.0, "step": 319 }, { "epoch": 0.0378384770013007, "grad_norm": 0.2399868369102478, "learning_rate": 4.466818322097253e-05, "loss": 0.3726, "num_tokens": 202591057.0, "step": 320 }, { "epoch": 0.03795672224192976, "grad_norm": 0.23226316273212433, "learning_rate": 4.4574699157882465e-05, "loss": 0.3846, "num_tokens": 203228812.0, "step": 321 }, { "epoch": 0.03807496748255883, "grad_norm": 0.263351172208786, "learning_rate": 4.44810447844111e-05, "loss": 0.4168, "num_tokens": 203868294.0, "step": 322 }, { "epoch": 0.03819321272318789, "grad_norm": 0.2123018354177475, "learning_rate": 4.438722147859095e-05, "loss": 0.3815, "num_tokens": 204499481.0, "step": 323 }, { "epoch": 0.03831145796381696, "grad_norm": 0.2778543531894684, "learning_rate": 4.429323062094026e-05, "loss": 0.3969, "num_tokens": 205133494.0, "step": 324 }, { "epoch": 0.03842970320444602, "grad_norm": 0.2408173829317093, "learning_rate": 4.419907359444259e-05, "loss": 0.4108, "num_tokens": 205767024.0, "step": 325 }, { "epoch": 0.038547948445075086, "grad_norm": 0.26782068610191345, "learning_rate": 4.410475178452652e-05, "loss": 0.4291, "num_tokens": 206400825.0, "step": 326 }, { "epoch": 0.03866619368570415, "grad_norm": 0.26312699913978577, "learning_rate": 4.4010266579045256e-05, "loss": 0.4136, "num_tokens": 207040239.0, "step": 327 }, { "epoch": 0.038784438926333216, "grad_norm": 0.256391704082489, "learning_rate": 4.391561936825623e-05, "loss": 0.3959, "num_tokens": 207676732.0, "step": 328 }, { "epoch": 0.03890268416696228, "grad_norm": 0.2285778969526291, "learning_rate": 4.3820811544800617e-05, "loss": 0.3881, "num_tokens": 208313021.0, "step": 329 }, { "epoch": 0.039020929407591345, "grad_norm": 0.2927227318286896, "learning_rate": 4.372584450368283e-05, "loss": 0.4485, "num_tokens": 208946344.0, "step": 330 }, { "epoch": 0.03913917464822041, "grad_norm": 0.25876858830451965, "learning_rate": 4.3630719642250034e-05, "loss": 0.4692, "num_tokens": 209577542.0, "step": 331 }, { "epoch": 0.039257419888849475, "grad_norm": 0.2661622166633606, "learning_rate": 4.3535438360171556e-05, "loss": 0.4608, "num_tokens": 210213046.0, "step": 332 }, { "epoch": 0.039375665129478536, "grad_norm": 0.2588401436805725, "learning_rate": 4.344000205941831e-05, "loss": 0.4155, "num_tokens": 210848130.0, "step": 333 }, { "epoch": 0.039493910370107604, "grad_norm": 0.25796785950660706, "learning_rate": 4.3344412144242146e-05, "loss": 0.4037, "num_tokens": 211482121.0, "step": 334 }, { "epoch": 0.039612155610736666, "grad_norm": 0.2662915587425232, "learning_rate": 4.3248670021155206e-05, "loss": 0.4512, "num_tokens": 212120668.0, "step": 335 }, { "epoch": 0.039730400851365734, "grad_norm": 0.24322502315044403, "learning_rate": 4.315277709890922e-05, "loss": 0.4174, "num_tokens": 212756102.0, "step": 336 }, { "epoch": 0.039848646091994795, "grad_norm": 0.2540619671344757, "learning_rate": 4.3056734788474785e-05, "loss": 0.4436, "num_tokens": 213392130.0, "step": 337 }, { "epoch": 0.03996689133262386, "grad_norm": 0.24154382944107056, "learning_rate": 4.29605445030206e-05, "loss": 0.3931, "num_tokens": 214021365.0, "step": 338 }, { "epoch": 0.040085136573252925, "grad_norm": 0.24840545654296875, "learning_rate": 4.286420765789267e-05, "loss": 0.4088, "num_tokens": 214651340.0, "step": 339 }, { "epoch": 0.04020338181388199, "grad_norm": 0.2844981551170349, "learning_rate": 4.276772567059347e-05, "loss": 0.4351, "num_tokens": 215284267.0, "step": 340 }, { "epoch": 0.040321627054511054, "grad_norm": 0.2580728232860565, "learning_rate": 4.2671099960761116e-05, "loss": 0.4454, "num_tokens": 215920647.0, "step": 341 }, { "epoch": 0.04043987229514012, "grad_norm": 0.2855488061904907, "learning_rate": 4.257433195014846e-05, "loss": 0.3805, "num_tokens": 216555518.0, "step": 342 }, { "epoch": 0.040558117535769184, "grad_norm": 0.21848393976688385, "learning_rate": 4.247742306260217e-05, "loss": 0.3795, "num_tokens": 217191272.0, "step": 343 }, { "epoch": 0.04067636277639825, "grad_norm": 0.264885276556015, "learning_rate": 4.238037472404176e-05, "loss": 0.4108, "num_tokens": 217824700.0, "step": 344 }, { "epoch": 0.04079460801702731, "grad_norm": 0.2161663919687271, "learning_rate": 4.228318836243865e-05, "loss": 0.3729, "num_tokens": 218455560.0, "step": 345 }, { "epoch": 0.04091285325765638, "grad_norm": 0.22689329087734222, "learning_rate": 4.218586540779515e-05, "loss": 0.421, "num_tokens": 219091298.0, "step": 346 }, { "epoch": 0.04103109849828544, "grad_norm": 0.24377533793449402, "learning_rate": 4.208840729212337e-05, "loss": 0.3951, "num_tokens": 219727733.0, "step": 347 }, { "epoch": 0.04114934373891451, "grad_norm": 0.24370762705802917, "learning_rate": 4.199081544942418e-05, "loss": 0.4481, "num_tokens": 220360695.0, "step": 348 }, { "epoch": 0.04126758897954357, "grad_norm": 0.23610427975654602, "learning_rate": 4.189309131566615e-05, "loss": 0.4373, "num_tokens": 220993405.0, "step": 349 }, { "epoch": 0.04138583422017264, "grad_norm": 0.2471226155757904, "learning_rate": 4.1795236328764354e-05, "loss": 0.4307, "num_tokens": 221619384.0, "step": 350 }, { "epoch": 0.0415040794608017, "grad_norm": 0.2555200159549713, "learning_rate": 4.169725192855925e-05, "loss": 0.4149, "num_tokens": 222250253.0, "step": 351 }, { "epoch": 0.04162232470143077, "grad_norm": 0.26108643412590027, "learning_rate": 4.159913955679548e-05, "loss": 0.4016, "num_tokens": 222884935.0, "step": 352 }, { "epoch": 0.04174056994205983, "grad_norm": 0.22140191495418549, "learning_rate": 4.150090065710067e-05, "loss": 0.4025, "num_tokens": 223516629.0, "step": 353 }, { "epoch": 0.0418588151826889, "grad_norm": 0.2396477907896042, "learning_rate": 4.1402536674964195e-05, "loss": 0.4105, "num_tokens": 224150031.0, "step": 354 }, { "epoch": 0.04197706042331796, "grad_norm": 0.23356612026691437, "learning_rate": 4.130404905771586e-05, "loss": 0.3962, "num_tokens": 224786071.0, "step": 355 }, { "epoch": 0.04209530566394703, "grad_norm": 0.2547277510166168, "learning_rate": 4.1205439254504666e-05, "loss": 0.4314, "num_tokens": 225421240.0, "step": 356 }, { "epoch": 0.04221355090457609, "grad_norm": 0.2576862871646881, "learning_rate": 4.110670871627745e-05, "loss": 0.396, "num_tokens": 226052174.0, "step": 357 }, { "epoch": 0.04233179614520516, "grad_norm": 0.22883984446525574, "learning_rate": 4.100785889575757e-05, "loss": 0.4374, "num_tokens": 226689398.0, "step": 358 }, { "epoch": 0.04245004138583422, "grad_norm": 0.23827779293060303, "learning_rate": 4.090889124742346e-05, "loss": 0.4014, "num_tokens": 227327616.0, "step": 359 }, { "epoch": 0.04256828662646329, "grad_norm": 0.22566570341587067, "learning_rate": 4.080980722748733e-05, "loss": 0.4054, "num_tokens": 227952686.0, "step": 360 }, { "epoch": 0.04268653186709235, "grad_norm": 0.2515687644481659, "learning_rate": 4.0710608293873634e-05, "loss": 0.4194, "num_tokens": 228587586.0, "step": 361 }, { "epoch": 0.04280477710772142, "grad_norm": 0.2160085290670395, "learning_rate": 4.0611295906197706e-05, "loss": 0.4048, "num_tokens": 229185285.0, "step": 362 }, { "epoch": 0.04292302234835048, "grad_norm": 0.21602442860603333, "learning_rate": 4.0511871525744224e-05, "loss": 0.3995, "num_tokens": 229815886.0, "step": 363 }, { "epoch": 0.04304126758897955, "grad_norm": 0.26638063788414, "learning_rate": 4.041233661544574e-05, "loss": 0.4104, "num_tokens": 230449875.0, "step": 364 }, { "epoch": 0.04315951282960861, "grad_norm": 0.21101397275924683, "learning_rate": 4.0312692639861146e-05, "loss": 0.4125, "num_tokens": 231087769.0, "step": 365 }, { "epoch": 0.043277758070237676, "grad_norm": 0.22914250195026398, "learning_rate": 4.021294106515411e-05, "loss": 0.3969, "num_tokens": 231720719.0, "step": 366 }, { "epoch": 0.04339600331086674, "grad_norm": 0.21389196813106537, "learning_rate": 4.011308335907152e-05, "loss": 0.3922, "num_tokens": 232354694.0, "step": 367 }, { "epoch": 0.0435142485514958, "grad_norm": 0.22924332320690155, "learning_rate": 4.00131209909219e-05, "loss": 0.4202, "num_tokens": 232986853.0, "step": 368 }, { "epoch": 0.04363249379212487, "grad_norm": 0.2374032735824585, "learning_rate": 3.991305543155378e-05, "loss": 0.4575, "num_tokens": 233626246.0, "step": 369 }, { "epoch": 0.04375073903275393, "grad_norm": 0.20903757214546204, "learning_rate": 3.981288815333399e-05, "loss": 0.3508, "num_tokens": 234256236.0, "step": 370 }, { "epoch": 0.043868984273383, "grad_norm": 0.23430699110031128, "learning_rate": 3.971262063012612e-05, "loss": 0.4202, "num_tokens": 234894656.0, "step": 371 }, { "epoch": 0.04398722951401206, "grad_norm": 0.21054008603096008, "learning_rate": 3.9612254337268734e-05, "loss": 0.4029, "num_tokens": 235530175.0, "step": 372 }, { "epoch": 0.044105474754641126, "grad_norm": 0.22597409784793854, "learning_rate": 3.95117907515537e-05, "loss": 0.3881, "num_tokens": 236165286.0, "step": 373 }, { "epoch": 0.04422371999527019, "grad_norm": 0.24336762726306915, "learning_rate": 3.941123135120445e-05, "loss": 0.389, "num_tokens": 236799872.0, "step": 374 }, { "epoch": 0.044341965235899256, "grad_norm": 0.2279030978679657, "learning_rate": 3.9310577615854264e-05, "loss": 0.3643, "num_tokens": 237436361.0, "step": 375 }, { "epoch": 0.04446021047652832, "grad_norm": 0.20615456998348236, "learning_rate": 3.920983102652443e-05, "loss": 0.3824, "num_tokens": 238072053.0, "step": 376 }, { "epoch": 0.044578455717157385, "grad_norm": 0.22816775739192963, "learning_rate": 3.910899306560251e-05, "loss": 0.4291, "num_tokens": 238707861.0, "step": 377 }, { "epoch": 0.044696700957786446, "grad_norm": 0.22566092014312744, "learning_rate": 3.9008065216820486e-05, "loss": 0.3967, "num_tokens": 239340071.0, "step": 378 }, { "epoch": 0.044814946198415515, "grad_norm": 0.22702094912528992, "learning_rate": 3.890704896523302e-05, "loss": 0.4185, "num_tokens": 239974165.0, "step": 379 }, { "epoch": 0.044933191439044576, "grad_norm": 0.20416148006916046, "learning_rate": 3.880594579719545e-05, "loss": 0.3879, "num_tokens": 240606077.0, "step": 380 }, { "epoch": 0.045051436679673644, "grad_norm": 0.2429252415895462, "learning_rate": 3.870475720034206e-05, "loss": 0.4027, "num_tokens": 241243195.0, "step": 381 }, { "epoch": 0.045169681920302705, "grad_norm": 0.24931378662586212, "learning_rate": 3.860348466356413e-05, "loss": 0.4474, "num_tokens": 241881692.0, "step": 382 }, { "epoch": 0.045287927160931774, "grad_norm": 0.26254212856292725, "learning_rate": 3.850212967698799e-05, "loss": 0.4189, "num_tokens": 242520949.0, "step": 383 }, { "epoch": 0.045406172401560835, "grad_norm": 0.2300311028957367, "learning_rate": 3.84006937319532e-05, "loss": 0.3986, "num_tokens": 243160575.0, "step": 384 }, { "epoch": 0.0455244176421899, "grad_norm": 0.24005557596683502, "learning_rate": 3.829917832099051e-05, "loss": 0.4128, "num_tokens": 243790943.0, "step": 385 }, { "epoch": 0.045642662882818964, "grad_norm": 0.2699725031852722, "learning_rate": 3.819758493779992e-05, "loss": 0.4602, "num_tokens": 244423844.0, "step": 386 }, { "epoch": 0.04576090812344803, "grad_norm": 0.23983405530452728, "learning_rate": 3.8095915077228754e-05, "loss": 0.3914, "num_tokens": 245054470.0, "step": 387 }, { "epoch": 0.045879153364077094, "grad_norm": 0.2433352917432785, "learning_rate": 3.79941702352496e-05, "loss": 0.3811, "num_tokens": 245688487.0, "step": 388 }, { "epoch": 0.04599739860470616, "grad_norm": 0.24374330043792725, "learning_rate": 3.7892351908938326e-05, "loss": 0.4106, "num_tokens": 246325682.0, "step": 389 }, { "epoch": 0.04611564384533522, "grad_norm": 0.21965977549552917, "learning_rate": 3.7790461596452057e-05, "loss": 0.4311, "num_tokens": 246961506.0, "step": 390 }, { "epoch": 0.04623388908596429, "grad_norm": 0.23189356923103333, "learning_rate": 3.7688500797007124e-05, "loss": 0.3798, "num_tokens": 247594032.0, "step": 391 }, { "epoch": 0.04635213432659335, "grad_norm": 0.2253284901380539, "learning_rate": 3.758647101085699e-05, "loss": 0.427, "num_tokens": 248227593.0, "step": 392 }, { "epoch": 0.04647037956722242, "grad_norm": 0.2451157420873642, "learning_rate": 3.748437373927022e-05, "loss": 0.4083, "num_tokens": 248859376.0, "step": 393 }, { "epoch": 0.04658862480785148, "grad_norm": 0.22249139845371246, "learning_rate": 3.738221048450834e-05, "loss": 0.4254, "num_tokens": 249493350.0, "step": 394 }, { "epoch": 0.04670687004848055, "grad_norm": 0.24161191284656525, "learning_rate": 3.7279982749803736e-05, "loss": 0.3853, "num_tokens": 250126507.0, "step": 395 }, { "epoch": 0.04682511528910961, "grad_norm": 0.23410917818546295, "learning_rate": 3.717769203933759e-05, "loss": 0.424, "num_tokens": 250765825.0, "step": 396 }, { "epoch": 0.04694336052973868, "grad_norm": 0.23993557691574097, "learning_rate": 3.7075339858217706e-05, "loss": 0.4189, "num_tokens": 251403372.0, "step": 397 }, { "epoch": 0.04706160577036774, "grad_norm": 0.32063019275665283, "learning_rate": 3.697292771245633e-05, "loss": 0.4365, "num_tokens": 252041762.0, "step": 398 }, { "epoch": 0.04717985101099681, "grad_norm": 0.21619325876235962, "learning_rate": 3.687045710894808e-05, "loss": 0.4027, "num_tokens": 252673716.0, "step": 399 }, { "epoch": 0.04729809625162587, "grad_norm": 0.2204645574092865, "learning_rate": 3.67679295554477e-05, "loss": 0.4117, "num_tokens": 253310805.0, "step": 400 }, { "epoch": 0.04741634149225494, "grad_norm": 0.21249093115329742, "learning_rate": 3.666534656054788e-05, "loss": 0.3398, "num_tokens": 253944650.0, "step": 401 }, { "epoch": 0.047534586732884, "grad_norm": 0.2515881061553955, "learning_rate": 3.65627096336571e-05, "loss": 0.3814, "num_tokens": 254579224.0, "step": 402 }, { "epoch": 0.04765283197351307, "grad_norm": 0.22720524668693542, "learning_rate": 3.646002028497738e-05, "loss": 0.3981, "num_tokens": 255212393.0, "step": 403 }, { "epoch": 0.04777107721414213, "grad_norm": 0.24506784975528717, "learning_rate": 3.63572800254821e-05, "loss": 0.4215, "num_tokens": 255846216.0, "step": 404 }, { "epoch": 0.0478893224547712, "grad_norm": 0.25425246357917786, "learning_rate": 3.625449036689372e-05, "loss": 0.429, "num_tokens": 256484541.0, "step": 405 }, { "epoch": 0.04800756769540026, "grad_norm": 0.23869769275188446, "learning_rate": 3.6151652821661576e-05, "loss": 0.3881, "num_tokens": 257114691.0, "step": 406 }, { "epoch": 0.04812581293602933, "grad_norm": 0.2546592056751251, "learning_rate": 3.604876890293959e-05, "loss": 0.4059, "num_tokens": 257748044.0, "step": 407 }, { "epoch": 0.04824405817665839, "grad_norm": 0.22846068441867828, "learning_rate": 3.594584012456403e-05, "loss": 0.3613, "num_tokens": 258386984.0, "step": 408 }, { "epoch": 0.04836230341728746, "grad_norm": 0.24633820354938507, "learning_rate": 3.584286800103124e-05, "loss": 0.4298, "num_tokens": 259023318.0, "step": 409 }, { "epoch": 0.04848054865791652, "grad_norm": 0.2492648810148239, "learning_rate": 3.573985404747535e-05, "loss": 0.383, "num_tokens": 259657204.0, "step": 410 }, { "epoch": 0.048598793898545586, "grad_norm": 0.22464512288570404, "learning_rate": 3.563679977964595e-05, "loss": 0.3838, "num_tokens": 260290556.0, "step": 411 }, { "epoch": 0.04871703913917465, "grad_norm": 0.27683940529823303, "learning_rate": 3.5533706713885844e-05, "loss": 0.4461, "num_tokens": 260928576.0, "step": 412 }, { "epoch": 0.048835284379803716, "grad_norm": 0.20443028211593628, "learning_rate": 3.5430576367108694e-05, "loss": 0.3948, "num_tokens": 261563484.0, "step": 413 }, { "epoch": 0.04895352962043278, "grad_norm": 0.20911704003810883, "learning_rate": 3.532741025677673e-05, "loss": 0.3649, "num_tokens": 262198058.0, "step": 414 }, { "epoch": 0.049071774861061845, "grad_norm": 0.27862629294395447, "learning_rate": 3.522420990087839e-05, "loss": 0.4237, "num_tokens": 262825300.0, "step": 415 }, { "epoch": 0.04919002010169091, "grad_norm": 0.2638210654258728, "learning_rate": 3.5120976817906e-05, "loss": 0.4384, "num_tokens": 263458362.0, "step": 416 }, { "epoch": 0.049308265342319975, "grad_norm": 0.24697713553905487, "learning_rate": 3.5017712526833454e-05, "loss": 0.3814, "num_tokens": 264088367.0, "step": 417 }, { "epoch": 0.049426510582949036, "grad_norm": 0.2173382192850113, "learning_rate": 3.491441854709384e-05, "loss": 0.3949, "num_tokens": 264724592.0, "step": 418 }, { "epoch": 0.049544755823578104, "grad_norm": 0.25613975524902344, "learning_rate": 3.481109639855707e-05, "loss": 0.3821, "num_tokens": 265360262.0, "step": 419 }, { "epoch": 0.049663001064207166, "grad_norm": 0.24708124995231628, "learning_rate": 3.470774760150753e-05, "loss": 0.4341, "num_tokens": 265997689.0, "step": 420 }, { "epoch": 0.04978124630483623, "grad_norm": 0.23348525166511536, "learning_rate": 3.460437367662173e-05, "loss": 0.4044, "num_tokens": 266631262.0, "step": 421 }, { "epoch": 0.049899491545465295, "grad_norm": 0.23553021252155304, "learning_rate": 3.450097614494592e-05, "loss": 0.3966, "num_tokens": 267268979.0, "step": 422 }, { "epoch": 0.050017736786094356, "grad_norm": 0.2573988139629364, "learning_rate": 3.439755652787366e-05, "loss": 0.4017, "num_tokens": 267904627.0, "step": 423 }, { "epoch": 0.050135982026723425, "grad_norm": 0.22248908877372742, "learning_rate": 3.4294116347123505e-05, "loss": 0.357, "num_tokens": 268543181.0, "step": 424 }, { "epoch": 0.050254227267352486, "grad_norm": 0.22894316911697388, "learning_rate": 3.419065712471659e-05, "loss": 0.4027, "num_tokens": 269179996.0, "step": 425 }, { "epoch": 0.050372472507981554, "grad_norm": 0.25380998849868774, "learning_rate": 3.4087180382954214e-05, "loss": 0.3843, "num_tokens": 269811253.0, "step": 426 }, { "epoch": 0.050490717748610615, "grad_norm": 0.23106823861598969, "learning_rate": 3.398368764439546e-05, "loss": 0.39, "num_tokens": 270441984.0, "step": 427 }, { "epoch": 0.050608962989239684, "grad_norm": 0.22412751615047455, "learning_rate": 3.388018043183478e-05, "loss": 0.3997, "num_tokens": 271074224.0, "step": 428 }, { "epoch": 0.050727208229868745, "grad_norm": 0.2578945457935333, "learning_rate": 3.377666026827962e-05, "loss": 0.446, "num_tokens": 271705707.0, "step": 429 }, { "epoch": 0.05084545347049781, "grad_norm": 0.23338672518730164, "learning_rate": 3.367312867692797e-05, "loss": 0.379, "num_tokens": 272318335.0, "step": 430 }, { "epoch": 0.050963698711126874, "grad_norm": 0.209132581949234, "learning_rate": 3.3569587181145974e-05, "loss": 0.416, "num_tokens": 272955472.0, "step": 431 }, { "epoch": 0.05108194395175594, "grad_norm": 0.21573707461357117, "learning_rate": 3.346603730444549e-05, "loss": 0.4051, "num_tokens": 273559901.0, "step": 432 }, { "epoch": 0.051200189192385004, "grad_norm": 0.24565227329730988, "learning_rate": 3.336248057046174e-05, "loss": 0.4033, "num_tokens": 274192648.0, "step": 433 }, { "epoch": 0.05131843443301407, "grad_norm": 0.20935006439685822, "learning_rate": 3.325891850293078e-05, "loss": 0.373, "num_tokens": 274831825.0, "step": 434 }, { "epoch": 0.05143667967364313, "grad_norm": 0.2096380591392517, "learning_rate": 3.315535262566722e-05, "loss": 0.358, "num_tokens": 275456510.0, "step": 435 }, { "epoch": 0.0515549249142722, "grad_norm": 0.22659966349601746, "learning_rate": 3.305178446254166e-05, "loss": 0.3623, "num_tokens": 276091876.0, "step": 436 }, { "epoch": 0.05167317015490126, "grad_norm": 0.21803656220436096, "learning_rate": 3.294821553745835e-05, "loss": 0.4107, "num_tokens": 276727335.0, "step": 437 }, { "epoch": 0.05179141539553033, "grad_norm": 0.207914799451828, "learning_rate": 3.284464737433279e-05, "loss": 0.4361, "num_tokens": 277359081.0, "step": 438 }, { "epoch": 0.05190966063615939, "grad_norm": 0.20631778240203857, "learning_rate": 3.2741081497069215e-05, "loss": 0.3707, "num_tokens": 277990765.0, "step": 439 }, { "epoch": 0.05202790587678846, "grad_norm": 0.2180744856595993, "learning_rate": 3.263751942953828e-05, "loss": 0.3962, "num_tokens": 278612675.0, "step": 440 }, { "epoch": 0.05214615111741752, "grad_norm": 0.19695664942264557, "learning_rate": 3.2533962695554515e-05, "loss": 0.3742, "num_tokens": 279246447.0, "step": 441 }, { "epoch": 0.05226439635804659, "grad_norm": 0.2566263973712921, "learning_rate": 3.243041281885404e-05, "loss": 0.4438, "num_tokens": 279880451.0, "step": 442 }, { "epoch": 0.05238264159867565, "grad_norm": 0.25131720304489136, "learning_rate": 3.232687132307204e-05, "loss": 0.4346, "num_tokens": 280517149.0, "step": 443 }, { "epoch": 0.05250088683930472, "grad_norm": 0.21113261580467224, "learning_rate": 3.222333973172039e-05, "loss": 0.3724, "num_tokens": 281144009.0, "step": 444 }, { "epoch": 0.05261913207993378, "grad_norm": 0.19451619684696198, "learning_rate": 3.211981956816523e-05, "loss": 0.3889, "num_tokens": 281781076.0, "step": 445 }, { "epoch": 0.05273737732056285, "grad_norm": 0.20863431692123413, "learning_rate": 3.201631235560456e-05, "loss": 0.3956, "num_tokens": 282418028.0, "step": 446 }, { "epoch": 0.05285562256119191, "grad_norm": 0.22713254392147064, "learning_rate": 3.1912819617045805e-05, "loss": 0.385, "num_tokens": 283052667.0, "step": 447 }, { "epoch": 0.05297386780182098, "grad_norm": 0.22602516412734985, "learning_rate": 3.180934287528342e-05, "loss": 0.4338, "num_tokens": 283689384.0, "step": 448 }, { "epoch": 0.05309211304245004, "grad_norm": 0.18873152136802673, "learning_rate": 3.170588365287651e-05, "loss": 0.3618, "num_tokens": 284325980.0, "step": 449 }, { "epoch": 0.05321035828307911, "grad_norm": 0.22596846520900726, "learning_rate": 3.1602443472126344e-05, "loss": 0.4158, "num_tokens": 284927410.0, "step": 450 }, { "epoch": 0.05332860352370817, "grad_norm": 0.20799311995506287, "learning_rate": 3.1499023855054086e-05, "loss": 0.4023, "num_tokens": 285561853.0, "step": 451 }, { "epoch": 0.05344684876433724, "grad_norm": 0.21219973266124725, "learning_rate": 3.1395626323378266e-05, "loss": 0.4127, "num_tokens": 286200667.0, "step": 452 }, { "epoch": 0.0535650940049663, "grad_norm": 0.23271985352039337, "learning_rate": 3.129225239849247e-05, "loss": 0.377, "num_tokens": 286838297.0, "step": 453 }, { "epoch": 0.05368333924559537, "grad_norm": 0.23054036498069763, "learning_rate": 3.118890360144293e-05, "loss": 0.3806, "num_tokens": 287478013.0, "step": 454 }, { "epoch": 0.05380158448622443, "grad_norm": 0.21830712258815765, "learning_rate": 3.1085581452906166e-05, "loss": 0.4124, "num_tokens": 288113641.0, "step": 455 }, { "epoch": 0.053919829726853497, "grad_norm": 0.21437396109104156, "learning_rate": 3.0982287473166544e-05, "loss": 0.4056, "num_tokens": 288748606.0, "step": 456 }, { "epoch": 0.05403807496748256, "grad_norm": 0.23408770561218262, "learning_rate": 3.087902318209401e-05, "loss": 0.3841, "num_tokens": 289387862.0, "step": 457 }, { "epoch": 0.054156320208111626, "grad_norm": 0.22132480144500732, "learning_rate": 3.0775790099121615e-05, "loss": 0.3859, "num_tokens": 290025351.0, "step": 458 }, { "epoch": 0.05427456544874069, "grad_norm": 0.21784645318984985, "learning_rate": 3.067258974322328e-05, "loss": 0.3752, "num_tokens": 290660065.0, "step": 459 }, { "epoch": 0.054392810689369755, "grad_norm": 0.21862168610095978, "learning_rate": 3.056942363289131e-05, "loss": 0.3808, "num_tokens": 291293769.0, "step": 460 }, { "epoch": 0.05451105592999882, "grad_norm": 0.23824048042297363, "learning_rate": 3.0466293286114164e-05, "loss": 0.3878, "num_tokens": 291932970.0, "step": 461 }, { "epoch": 0.054629301170627885, "grad_norm": 0.20896966755390167, "learning_rate": 3.036320022035405e-05, "loss": 0.3958, "num_tokens": 292570196.0, "step": 462 }, { "epoch": 0.054747546411256946, "grad_norm": 0.22659938037395477, "learning_rate": 3.0260145952524658e-05, "loss": 0.4281, "num_tokens": 293208953.0, "step": 463 }, { "epoch": 0.054865791651886014, "grad_norm": 0.25533124804496765, "learning_rate": 3.0157131998968765e-05, "loss": 0.3964, "num_tokens": 293841194.0, "step": 464 }, { "epoch": 0.054984036892515076, "grad_norm": 0.21840247511863708, "learning_rate": 3.0054159875435977e-05, "loss": 0.4031, "num_tokens": 294478601.0, "step": 465 }, { "epoch": 0.055102282133144144, "grad_norm": 0.20685090124607086, "learning_rate": 2.995123109706042e-05, "loss": 0.3979, "num_tokens": 295110727.0, "step": 466 }, { "epoch": 0.055220527373773205, "grad_norm": 0.24118992686271667, "learning_rate": 2.984834717833843e-05, "loss": 0.3724, "num_tokens": 295744614.0, "step": 467 }, { "epoch": 0.05533877261440227, "grad_norm": 0.21696403622627258, "learning_rate": 2.9745509633106285e-05, "loss": 0.3875, "num_tokens": 296380861.0, "step": 468 }, { "epoch": 0.055457017855031335, "grad_norm": 0.2347799837589264, "learning_rate": 2.964271997451791e-05, "loss": 0.3951, "num_tokens": 297007279.0, "step": 469 }, { "epoch": 0.0555752630956604, "grad_norm": 0.2174369990825653, "learning_rate": 2.9539979715022626e-05, "loss": 0.3757, "num_tokens": 297623126.0, "step": 470 }, { "epoch": 0.055693508336289464, "grad_norm": 0.2637596130371094, "learning_rate": 2.943729036634291e-05, "loss": 0.4343, "num_tokens": 298255789.0, "step": 471 }, { "epoch": 0.05581175357691853, "grad_norm": 0.20758095383644104, "learning_rate": 2.9334653439452135e-05, "loss": 0.4108, "num_tokens": 298885491.0, "step": 472 }, { "epoch": 0.055929998817547594, "grad_norm": 0.2174261063337326, "learning_rate": 2.9232070444552315e-05, "loss": 0.3799, "num_tokens": 299521680.0, "step": 473 }, { "epoch": 0.056048244058176655, "grad_norm": 0.23763810098171234, "learning_rate": 2.9129542891051922e-05, "loss": 0.3902, "num_tokens": 300161168.0, "step": 474 }, { "epoch": 0.05616648929880572, "grad_norm": 0.21108706295490265, "learning_rate": 2.9027072287543666e-05, "loss": 0.3795, "num_tokens": 300793878.0, "step": 475 }, { "epoch": 0.056284734539434784, "grad_norm": 0.22178317606449127, "learning_rate": 2.89246601417823e-05, "loss": 0.4045, "num_tokens": 301427428.0, "step": 476 }, { "epoch": 0.05640297978006385, "grad_norm": 0.23109114170074463, "learning_rate": 2.8822307960662403e-05, "loss": 0.4327, "num_tokens": 302063334.0, "step": 477 }, { "epoch": 0.056521225020692914, "grad_norm": 0.23473629355430603, "learning_rate": 2.8720017250196266e-05, "loss": 0.3764, "num_tokens": 302696034.0, "step": 478 }, { "epoch": 0.05663947026132198, "grad_norm": 0.22509372234344482, "learning_rate": 2.861778951549167e-05, "loss": 0.4, "num_tokens": 303331655.0, "step": 479 }, { "epoch": 0.056757715501951043, "grad_norm": 0.2507939040660858, "learning_rate": 2.851562626072978e-05, "loss": 0.4712, "num_tokens": 303962802.0, "step": 480 }, { "epoch": 0.05687596074258011, "grad_norm": 0.22742438316345215, "learning_rate": 2.8413528989143004e-05, "loss": 0.3897, "num_tokens": 304595703.0, "step": 481 }, { "epoch": 0.05699420598320917, "grad_norm": 0.2183639109134674, "learning_rate": 2.8311499202992885e-05, "loss": 0.3931, "num_tokens": 305227732.0, "step": 482 }, { "epoch": 0.05711245122383824, "grad_norm": 0.21615217626094818, "learning_rate": 2.820953840354795e-05, "loss": 0.3938, "num_tokens": 305861092.0, "step": 483 }, { "epoch": 0.0572306964644673, "grad_norm": 0.22431129217147827, "learning_rate": 2.810764809106168e-05, "loss": 0.3977, "num_tokens": 306497173.0, "step": 484 }, { "epoch": 0.05734894170509637, "grad_norm": 0.2126999795436859, "learning_rate": 2.800582976475041e-05, "loss": 0.3847, "num_tokens": 307133773.0, "step": 485 }, { "epoch": 0.05746718694572543, "grad_norm": 0.21983444690704346, "learning_rate": 2.7904084922771254e-05, "loss": 0.3773, "num_tokens": 307760142.0, "step": 486 }, { "epoch": 0.0575854321863545, "grad_norm": 0.20621925592422485, "learning_rate": 2.7802415062200087e-05, "loss": 0.4089, "num_tokens": 308392658.0, "step": 487 }, { "epoch": 0.05770367742698356, "grad_norm": 0.2080400586128235, "learning_rate": 2.77008216790095e-05, "loss": 0.3654, "num_tokens": 309023200.0, "step": 488 }, { "epoch": 0.05782192266761263, "grad_norm": 0.21669632196426392, "learning_rate": 2.759930626804681e-05, "loss": 0.4097, "num_tokens": 309657191.0, "step": 489 }, { "epoch": 0.05794016790824169, "grad_norm": 0.2028190642595291, "learning_rate": 2.7497870323012014e-05, "loss": 0.4037, "num_tokens": 310290361.0, "step": 490 }, { "epoch": 0.05805841314887076, "grad_norm": 0.23138827085494995, "learning_rate": 2.7396515336435878e-05, "loss": 0.4207, "num_tokens": 310922697.0, "step": 491 }, { "epoch": 0.05817665838949982, "grad_norm": 0.23582817614078522, "learning_rate": 2.7295242799657938e-05, "loss": 0.4111, "num_tokens": 311557453.0, "step": 492 }, { "epoch": 0.05829490363012889, "grad_norm": 0.20863734185695648, "learning_rate": 2.7194054202804555e-05, "loss": 0.4126, "num_tokens": 312193193.0, "step": 493 }, { "epoch": 0.05841314887075795, "grad_norm": 0.21243295073509216, "learning_rate": 2.709295103476699e-05, "loss": 0.4107, "num_tokens": 312828473.0, "step": 494 }, { "epoch": 0.05853139411138702, "grad_norm": 0.21561166644096375, "learning_rate": 2.6991934783179515e-05, "loss": 0.3824, "num_tokens": 313462344.0, "step": 495 }, { "epoch": 0.05864963935201608, "grad_norm": 0.23026616871356964, "learning_rate": 2.6891006934397505e-05, "loss": 0.3821, "num_tokens": 314080640.0, "step": 496 }, { "epoch": 0.05876788459264515, "grad_norm": 0.2129206657409668, "learning_rate": 2.6790168973475585e-05, "loss": 0.3938, "num_tokens": 314717785.0, "step": 497 }, { "epoch": 0.05888612983327421, "grad_norm": 0.23650778830051422, "learning_rate": 2.6689422384145744e-05, "loss": 0.4503, "num_tokens": 315351322.0, "step": 498 }, { "epoch": 0.05900437507390328, "grad_norm": 0.20518648624420166, "learning_rate": 2.658876864879555e-05, "loss": 0.4028, "num_tokens": 315987690.0, "step": 499 }, { "epoch": 0.05912262031453234, "grad_norm": 191.65394592285156, "learning_rate": 2.648820924844631e-05, "loss": 4.9729, "num_tokens": 316588692.0, "step": 500 }, { "epoch": 0.05924086555516141, "grad_norm": 0.27919653058052063, "learning_rate": 2.6387745662731268e-05, "loss": 0.3813, "num_tokens": 317208507.0, "step": 501 }, { "epoch": 0.05935911079579047, "grad_norm": 0.23787546157836914, "learning_rate": 2.6287379369873878e-05, "loss": 0.4319, "num_tokens": 317844277.0, "step": 502 }, { "epoch": 0.059477356036419536, "grad_norm": 0.24857866764068604, "learning_rate": 2.6187111846666015e-05, "loss": 0.4168, "num_tokens": 318478032.0, "step": 503 }, { "epoch": 0.0595956012770486, "grad_norm": 0.2471940517425537, "learning_rate": 2.6086944568446233e-05, "loss": 0.4461, "num_tokens": 319114663.0, "step": 504 }, { "epoch": 0.059713846517677666, "grad_norm": 0.23387108743190765, "learning_rate": 2.5986879009078095e-05, "loss": 0.3444, "num_tokens": 319744940.0, "step": 505 }, { "epoch": 0.05983209175830673, "grad_norm": 0.2251531183719635, "learning_rate": 2.5886916640928474e-05, "loss": 0.3914, "num_tokens": 320377220.0, "step": 506 }, { "epoch": 0.059950336998935795, "grad_norm": 0.22722141444683075, "learning_rate": 2.57870589348459e-05, "loss": 0.3942, "num_tokens": 321010441.0, "step": 507 }, { "epoch": 0.060068582239564856, "grad_norm": 0.22561167180538177, "learning_rate": 2.568730736013887e-05, "loss": 0.3771, "num_tokens": 321643644.0, "step": 508 }, { "epoch": 0.060186827480193925, "grad_norm": 0.22242951393127441, "learning_rate": 2.5587663384554264e-05, "loss": 0.3877, "num_tokens": 322275355.0, "step": 509 }, { "epoch": 0.060305072720822986, "grad_norm": 0.22118404507637024, "learning_rate": 2.5488128474255777e-05, "loss": 0.4112, "num_tokens": 322908591.0, "step": 510 }, { "epoch": 0.060423317961452054, "grad_norm": 0.2330280840396881, "learning_rate": 2.5388704093802296e-05, "loss": 0.4106, "num_tokens": 323542459.0, "step": 511 }, { "epoch": 0.060541563202081115, "grad_norm": 0.2232893407344818, "learning_rate": 2.5289391706126375e-05, "loss": 0.3905, "num_tokens": 324176254.0, "step": 512 }, { "epoch": 0.060659808442710184, "grad_norm": 0.20871341228485107, "learning_rate": 2.5190192772512675e-05, "loss": 0.3664, "num_tokens": 324803884.0, "step": 513 }, { "epoch": 0.060778053683339245, "grad_norm": 0.2513749599456787, "learning_rate": 2.509110875257654e-05, "loss": 0.4212, "num_tokens": 325442394.0, "step": 514 }, { "epoch": 0.06089629892396831, "grad_norm": 0.2195710688829422, "learning_rate": 2.4992141104242444e-05, "loss": 0.4119, "num_tokens": 326081004.0, "step": 515 }, { "epoch": 0.061014544164597374, "grad_norm": 0.2243558168411255, "learning_rate": 2.4893291283722552e-05, "loss": 0.3926, "num_tokens": 326711646.0, "step": 516 }, { "epoch": 0.06113278940522644, "grad_norm": 0.2674740254878998, "learning_rate": 2.479456074549534e-05, "loss": 0.387, "num_tokens": 327346340.0, "step": 517 }, { "epoch": 0.061251034645855504, "grad_norm": 0.19878649711608887, "learning_rate": 2.469595094228415e-05, "loss": 0.3669, "num_tokens": 327982364.0, "step": 518 }, { "epoch": 0.06136927988648457, "grad_norm": 0.24535562098026276, "learning_rate": 2.4597463325035814e-05, "loss": 0.4298, "num_tokens": 328613513.0, "step": 519 }, { "epoch": 0.06148752512711363, "grad_norm": 0.2603405714035034, "learning_rate": 2.4499099342899335e-05, "loss": 0.4045, "num_tokens": 329246797.0, "step": 520 }, { "epoch": 0.0616057703677427, "grad_norm": 0.2385442852973938, "learning_rate": 2.4400860443204524e-05, "loss": 0.4146, "num_tokens": 329882051.0, "step": 521 }, { "epoch": 0.06172401560837176, "grad_norm": 0.2290627807378769, "learning_rate": 2.4302748071440763e-05, "loss": 0.3872, "num_tokens": 330518781.0, "step": 522 }, { "epoch": 0.06184226084900083, "grad_norm": 0.22756050527095795, "learning_rate": 2.4204763671235655e-05, "loss": 0.3926, "num_tokens": 331148822.0, "step": 523 }, { "epoch": 0.06196050608962989, "grad_norm": 0.24352355301380157, "learning_rate": 2.4106908684333856e-05, "loss": 0.4102, "num_tokens": 331779530.0, "step": 524 }, { "epoch": 0.06207875133025896, "grad_norm": 0.24109165370464325, "learning_rate": 2.4009184550575824e-05, "loss": 0.3531, "num_tokens": 332409781.0, "step": 525 }, { "epoch": 0.06219699657088802, "grad_norm": 0.2488730251789093, "learning_rate": 2.3911592707876643e-05, "loss": 0.4369, "num_tokens": 333047880.0, "step": 526 }, { "epoch": 0.06231524181151708, "grad_norm": 0.19400961697101593, "learning_rate": 2.381413459220485e-05, "loss": 0.4013, "num_tokens": 333683586.0, "step": 527 }, { "epoch": 0.06243348705214615, "grad_norm": 0.23837246000766754, "learning_rate": 2.371681163756134e-05, "loss": 0.4032, "num_tokens": 334315506.0, "step": 528 }, { "epoch": 0.06255173229277522, "grad_norm": 0.21308279037475586, "learning_rate": 2.361962527595824e-05, "loss": 0.3715, "num_tokens": 334951291.0, "step": 529 }, { "epoch": 0.06266997753340428, "grad_norm": 0.2166317254304886, "learning_rate": 2.352257693739783e-05, "loss": 0.3831, "num_tokens": 335585867.0, "step": 530 }, { "epoch": 0.06278822277403334, "grad_norm": 0.22203749418258667, "learning_rate": 2.3425668049851535e-05, "loss": 0.3727, "num_tokens": 336220052.0, "step": 531 }, { "epoch": 0.0629064680146624, "grad_norm": 0.20896011590957642, "learning_rate": 2.3328900039238882e-05, "loss": 0.4235, "num_tokens": 336856340.0, "step": 532 }, { "epoch": 0.06302471325529148, "grad_norm": 0.19895263016223907, "learning_rate": 2.323227432940654e-05, "loss": 0.3766, "num_tokens": 337491980.0, "step": 533 }, { "epoch": 0.06314295849592054, "grad_norm": 0.21203070878982544, "learning_rate": 2.3135792342107335e-05, "loss": 0.3798, "num_tokens": 338130649.0, "step": 534 }, { "epoch": 0.0632612037365496, "grad_norm": 0.24771364033222198, "learning_rate": 2.3039455496979403e-05, "loss": 0.4252, "num_tokens": 338765865.0, "step": 535 }, { "epoch": 0.06337944897717866, "grad_norm": 0.2177121639251709, "learning_rate": 2.294326521152522e-05, "loss": 0.3969, "num_tokens": 339363366.0, "step": 536 }, { "epoch": 0.06349769421780774, "grad_norm": 0.2345789521932602, "learning_rate": 2.2847222901090787e-05, "loss": 0.4415, "num_tokens": 339999650.0, "step": 537 }, { "epoch": 0.0636159394584368, "grad_norm": 0.21585899591445923, "learning_rate": 2.2751329978844802e-05, "loss": 0.4005, "num_tokens": 340634297.0, "step": 538 }, { "epoch": 0.06373418469906586, "grad_norm": 0.23928019404411316, "learning_rate": 2.2655587855757862e-05, "loss": 0.4249, "num_tokens": 341269246.0, "step": 539 }, { "epoch": 0.06385242993969492, "grad_norm": 0.2342565506696701, "learning_rate": 2.255999794058169e-05, "loss": 0.4108, "num_tokens": 341900107.0, "step": 540 }, { "epoch": 0.063970675180324, "grad_norm": 0.2086341232061386, "learning_rate": 2.246456163982845e-05, "loss": 0.4149, "num_tokens": 342539152.0, "step": 541 }, { "epoch": 0.06408892042095306, "grad_norm": 0.20828045904636383, "learning_rate": 2.236928035774997e-05, "loss": 0.4131, "num_tokens": 343166271.0, "step": 542 }, { "epoch": 0.06420716566158212, "grad_norm": 0.20667296648025513, "learning_rate": 2.2274155496317174e-05, "loss": 0.3735, "num_tokens": 343801657.0, "step": 543 }, { "epoch": 0.06432541090221118, "grad_norm": 0.20303893089294434, "learning_rate": 2.217918845519939e-05, "loss": 0.3877, "num_tokens": 344436926.0, "step": 544 }, { "epoch": 0.06444365614284026, "grad_norm": 0.1928926706314087, "learning_rate": 2.208438063174377e-05, "loss": 0.3732, "num_tokens": 345071661.0, "step": 545 }, { "epoch": 0.06456190138346932, "grad_norm": 0.24890753626823425, "learning_rate": 2.1989733420954752e-05, "loss": 0.4558, "num_tokens": 345710262.0, "step": 546 }, { "epoch": 0.06468014662409838, "grad_norm": 0.21143190562725067, "learning_rate": 2.1895248215473494e-05, "loss": 0.396, "num_tokens": 346345760.0, "step": 547 }, { "epoch": 0.06479839186472744, "grad_norm": 0.20359720289707184, "learning_rate": 2.1800926405557425e-05, "loss": 0.3731, "num_tokens": 346978472.0, "step": 548 }, { "epoch": 0.06491663710535651, "grad_norm": 0.22057001292705536, "learning_rate": 2.1706769379059748e-05, "loss": 0.3875, "num_tokens": 347615067.0, "step": 549 }, { "epoch": 0.06503488234598558, "grad_norm": 0.21384279429912567, "learning_rate": 2.161277852140905e-05, "loss": 0.4085, "num_tokens": 348251545.0, "step": 550 }, { "epoch": 0.06515312758661464, "grad_norm": 0.2024473398923874, "learning_rate": 2.151895521558892e-05, "loss": 0.3993, "num_tokens": 348888946.0, "step": 551 }, { "epoch": 0.0652713728272437, "grad_norm": 0.23349706828594208, "learning_rate": 2.1425300842117537e-05, "loss": 0.4371, "num_tokens": 349519613.0, "step": 552 }, { "epoch": 0.06538961806787277, "grad_norm": 0.20403353869915009, "learning_rate": 2.133181677902747e-05, "loss": 0.39, "num_tokens": 350152254.0, "step": 553 }, { "epoch": 0.06550786330850183, "grad_norm": 0.21594958007335663, "learning_rate": 2.1238504401845306e-05, "loss": 0.3878, "num_tokens": 350786547.0, "step": 554 }, { "epoch": 0.0656261085491309, "grad_norm": 0.2234022170305252, "learning_rate": 2.1145365083571418e-05, "loss": 0.3961, "num_tokens": 351422383.0, "step": 555 }, { "epoch": 0.06574435378975996, "grad_norm": 0.21230868995189667, "learning_rate": 2.105240019465984e-05, "loss": 0.4062, "num_tokens": 352061087.0, "step": 556 }, { "epoch": 0.06586259903038903, "grad_norm": 0.21539685130119324, "learning_rate": 2.095961110299799e-05, "loss": 0.3688, "num_tokens": 352697788.0, "step": 557 }, { "epoch": 0.0659808442710181, "grad_norm": 0.2293699085712433, "learning_rate": 2.086699917388664e-05, "loss": 0.3892, "num_tokens": 353327836.0, "step": 558 }, { "epoch": 0.06609908951164715, "grad_norm": 0.2052561193704605, "learning_rate": 2.0774565770019797e-05, "loss": 0.38, "num_tokens": 353963351.0, "step": 559 }, { "epoch": 0.06621733475227622, "grad_norm": 0.2142726480960846, "learning_rate": 2.06823122514646e-05, "loss": 0.4144, "num_tokens": 354576403.0, "step": 560 }, { "epoch": 0.06633557999290529, "grad_norm": 0.22821441292762756, "learning_rate": 2.0590239975641387e-05, "loss": 0.4167, "num_tokens": 355211385.0, "step": 561 }, { "epoch": 0.06645382523353435, "grad_norm": 0.2132827341556549, "learning_rate": 2.0498350297303682e-05, "loss": 0.4177, "num_tokens": 355842242.0, "step": 562 }, { "epoch": 0.06657207047416341, "grad_norm": 0.21103453636169434, "learning_rate": 2.0406644568518244e-05, "loss": 0.3693, "num_tokens": 356473678.0, "step": 563 }, { "epoch": 0.06669031571479248, "grad_norm": 0.20970512926578522, "learning_rate": 2.031512413864523e-05, "loss": 0.3613, "num_tokens": 357112492.0, "step": 564 }, { "epoch": 0.06680856095542155, "grad_norm": 0.25300124287605286, "learning_rate": 2.0223790354318263e-05, "loss": 0.3903, "num_tokens": 357748442.0, "step": 565 }, { "epoch": 0.06692680619605061, "grad_norm": 0.18217869102954865, "learning_rate": 2.013264455942469e-05, "loss": 0.3531, "num_tokens": 358387012.0, "step": 566 }, { "epoch": 0.06704505143667967, "grad_norm": 0.202724426984787, "learning_rate": 2.0041688095085776e-05, "loss": 0.3667, "num_tokens": 359025550.0, "step": 567 }, { "epoch": 0.06716329667730873, "grad_norm": 0.2007199376821518, "learning_rate": 1.9950922299636945e-05, "loss": 0.407, "num_tokens": 359661915.0, "step": 568 }, { "epoch": 0.06728154191793781, "grad_norm": 0.212602898478508, "learning_rate": 1.986034850860815e-05, "loss": 0.3709, "num_tokens": 360295739.0, "step": 569 }, { "epoch": 0.06739978715856687, "grad_norm": 0.20929577946662903, "learning_rate": 1.9769968054704174e-05, "loss": 0.4242, "num_tokens": 360929829.0, "step": 570 }, { "epoch": 0.06751803239919593, "grad_norm": 0.19647814333438873, "learning_rate": 1.9679782267785006e-05, "loss": 0.3632, "num_tokens": 361568418.0, "step": 571 }, { "epoch": 0.067636277639825, "grad_norm": 0.22293861210346222, "learning_rate": 1.9589792474846353e-05, "loss": 0.3513, "num_tokens": 362197303.0, "step": 572 }, { "epoch": 0.06775452288045407, "grad_norm": 0.24212084710597992, "learning_rate": 1.9500000000000006e-05, "loss": 0.393, "num_tokens": 362825904.0, "step": 573 }, { "epoch": 0.06787276812108313, "grad_norm": 0.20932357013225555, "learning_rate": 1.9410406164454458e-05, "loss": 0.3854, "num_tokens": 363465140.0, "step": 574 }, { "epoch": 0.06799101336171219, "grad_norm": 0.20063243806362152, "learning_rate": 1.9321012286495403e-05, "loss": 0.3874, "num_tokens": 364097168.0, "step": 575 }, { "epoch": 0.06810925860234125, "grad_norm": 0.1878458708524704, "learning_rate": 1.9231819681466337e-05, "loss": 0.3658, "num_tokens": 364728470.0, "step": 576 }, { "epoch": 0.06822750384297033, "grad_norm": 0.2246370166540146, "learning_rate": 1.914282966174925e-05, "loss": 0.4268, "num_tokens": 365363497.0, "step": 577 }, { "epoch": 0.06834574908359939, "grad_norm": 0.24067296087741852, "learning_rate": 1.9054043536745268e-05, "loss": 0.4456, "num_tokens": 366000699.0, "step": 578 }, { "epoch": 0.06846399432422845, "grad_norm": 0.18838095664978027, "learning_rate": 1.8965462612855428e-05, "loss": 0.3526, "num_tokens": 366624851.0, "step": 579 }, { "epoch": 0.06858223956485751, "grad_norm": 0.1913178265094757, "learning_rate": 1.8877088193461407e-05, "loss": 0.3845, "num_tokens": 367261099.0, "step": 580 }, { "epoch": 0.06870048480548657, "grad_norm": 0.20684710144996643, "learning_rate": 1.878892157890638e-05, "loss": 0.3567, "num_tokens": 367897458.0, "step": 581 }, { "epoch": 0.06881873004611565, "grad_norm": 0.21800653636455536, "learning_rate": 1.8700964066475868e-05, "loss": 0.4243, "num_tokens": 368534927.0, "step": 582 }, { "epoch": 0.06893697528674471, "grad_norm": 0.21104471385478973, "learning_rate": 1.86132169503787e-05, "loss": 0.4147, "num_tokens": 369169358.0, "step": 583 }, { "epoch": 0.06905522052737377, "grad_norm": 0.20770899951457977, "learning_rate": 1.8525681521727856e-05, "loss": 0.405, "num_tokens": 369806601.0, "step": 584 }, { "epoch": 0.06917346576800283, "grad_norm": 0.20592570304870605, "learning_rate": 1.8438359068521625e-05, "loss": 0.3933, "num_tokens": 370442728.0, "step": 585 }, { "epoch": 0.0692917110086319, "grad_norm": 0.20783546566963196, "learning_rate": 1.83512508756245e-05, "loss": 0.4044, "num_tokens": 371079275.0, "step": 586 }, { "epoch": 0.06940995624926097, "grad_norm": 0.20856884121894836, "learning_rate": 1.8264358224748374e-05, "loss": 0.3986, "num_tokens": 371716282.0, "step": 587 }, { "epoch": 0.06952820148989003, "grad_norm": 0.19124871492385864, "learning_rate": 1.817768239443367e-05, "loss": 0.4001, "num_tokens": 372347661.0, "step": 588 }, { "epoch": 0.06964644673051909, "grad_norm": 0.22391672432422638, "learning_rate": 1.8091224660030457e-05, "loss": 0.3906, "num_tokens": 372977936.0, "step": 589 }, { "epoch": 0.06976469197114817, "grad_norm": 0.22682306170463562, "learning_rate": 1.8004986293679783e-05, "loss": 0.4097, "num_tokens": 373613192.0, "step": 590 }, { "epoch": 0.06988293721177723, "grad_norm": 0.1943192332983017, "learning_rate": 1.79189685642949e-05, "loss": 0.4018, "num_tokens": 374251716.0, "step": 591 }, { "epoch": 0.07000118245240629, "grad_norm": 0.1957077533006668, "learning_rate": 1.7833172737542572e-05, "loss": 0.359, "num_tokens": 374880470.0, "step": 592 }, { "epoch": 0.07011942769303535, "grad_norm": 0.21087896823883057, "learning_rate": 1.774760007582453e-05, "loss": 0.399, "num_tokens": 375518014.0, "step": 593 }, { "epoch": 0.07023767293366442, "grad_norm": 0.20506969094276428, "learning_rate": 1.76622518382588e-05, "loss": 0.4005, "num_tokens": 376156899.0, "step": 594 }, { "epoch": 0.07035591817429349, "grad_norm": 0.18575182557106018, "learning_rate": 1.7577129280661264e-05, "loss": 0.3837, "num_tokens": 376796416.0, "step": 595 }, { "epoch": 0.07047416341492255, "grad_norm": 0.20734459161758423, "learning_rate": 1.7492233655527138e-05, "loss": 0.3834, "num_tokens": 377430299.0, "step": 596 }, { "epoch": 0.07059240865555161, "grad_norm": 0.18199484050273895, "learning_rate": 1.7407566212012526e-05, "loss": 0.3334, "num_tokens": 378036010.0, "step": 597 }, { "epoch": 0.07071065389618068, "grad_norm": 0.21089966595172882, "learning_rate": 1.7323128195916088e-05, "loss": 0.4233, "num_tokens": 378662576.0, "step": 598 }, { "epoch": 0.07082889913680974, "grad_norm": 0.19139453768730164, "learning_rate": 1.723892084966068e-05, "loss": 0.3544, "num_tokens": 379292706.0, "step": 599 }, { "epoch": 0.0709471443774388, "grad_norm": 0.20988748967647552, "learning_rate": 1.7154945412275056e-05, "loss": 0.4113, "num_tokens": 379923752.0, "step": 600 }, { "epoch": 0.07106538961806787, "grad_norm": 0.21000663936138153, "learning_rate": 1.7071203119375692e-05, "loss": 0.3831, "num_tokens": 380556540.0, "step": 601 }, { "epoch": 0.07118363485869694, "grad_norm": 0.187398761510849, "learning_rate": 1.698769520314853e-05, "loss": 0.3572, "num_tokens": 381191645.0, "step": 602 }, { "epoch": 0.071301880099326, "grad_norm": 0.1953067183494568, "learning_rate": 1.6904422892330918e-05, "loss": 0.4128, "num_tokens": 381827763.0, "step": 603 }, { "epoch": 0.07142012533995507, "grad_norm": 0.19437581300735474, "learning_rate": 1.68213874121935e-05, "loss": 0.379, "num_tokens": 382466825.0, "step": 604 }, { "epoch": 0.07153837058058413, "grad_norm": 0.21022436022758484, "learning_rate": 1.6738589984522172e-05, "loss": 0.3804, "num_tokens": 383103907.0, "step": 605 }, { "epoch": 0.0716566158212132, "grad_norm": 0.2030460089445114, "learning_rate": 1.665603182760014e-05, "loss": 0.4009, "num_tokens": 383736705.0, "step": 606 }, { "epoch": 0.07177486106184226, "grad_norm": 0.21273180842399597, "learning_rate": 1.657371415618996e-05, "loss": 0.4078, "num_tokens": 384376310.0, "step": 607 }, { "epoch": 0.07189310630247132, "grad_norm": 0.184920996427536, "learning_rate": 1.6491638181515668e-05, "loss": 0.3793, "num_tokens": 385007094.0, "step": 608 }, { "epoch": 0.07201135154310039, "grad_norm": 0.18787135183811188, "learning_rate": 1.6409805111245015e-05, "loss": 0.3604, "num_tokens": 385646534.0, "step": 609 }, { "epoch": 0.07212959678372946, "grad_norm": 0.18394650518894196, "learning_rate": 1.632821614947159e-05, "loss": 0.3549, "num_tokens": 386283106.0, "step": 610 }, { "epoch": 0.07224784202435852, "grad_norm": 0.18069717288017273, "learning_rate": 1.624687249669722e-05, "loss": 0.3509, "num_tokens": 386916395.0, "step": 611 }, { "epoch": 0.07236608726498758, "grad_norm": 0.2196332812309265, "learning_rate": 1.6165775349814197e-05, "loss": 0.3995, "num_tokens": 387553614.0, "step": 612 }, { "epoch": 0.07248433250561664, "grad_norm": 0.20063099265098572, "learning_rate": 1.608492590208777e-05, "loss": 0.3657, "num_tokens": 388189908.0, "step": 613 }, { "epoch": 0.07260257774624572, "grad_norm": 0.2032419592142105, "learning_rate": 1.6004325343138506e-05, "loss": 0.4057, "num_tokens": 388827274.0, "step": 614 }, { "epoch": 0.07272082298687478, "grad_norm": 0.2009783685207367, "learning_rate": 1.5923974858924816e-05, "loss": 0.3746, "num_tokens": 389460786.0, "step": 615 }, { "epoch": 0.07283906822750384, "grad_norm": 0.19908788800239563, "learning_rate": 1.5843875631725528e-05, "loss": 0.3981, "num_tokens": 390066154.0, "step": 616 }, { "epoch": 0.0729573134681329, "grad_norm": 0.1834346354007721, "learning_rate": 1.5764028840122463e-05, "loss": 0.3648, "num_tokens": 390700370.0, "step": 617 }, { "epoch": 0.07307555870876198, "grad_norm": 0.2006380409002304, "learning_rate": 1.568443565898307e-05, "loss": 0.3693, "num_tokens": 391333815.0, "step": 618 }, { "epoch": 0.07319380394939104, "grad_norm": 0.1987355351448059, "learning_rate": 1.5605097259443196e-05, "loss": 0.3864, "num_tokens": 391972641.0, "step": 619 }, { "epoch": 0.0733120491900201, "grad_norm": 0.20351499319076538, "learning_rate": 1.5526014808889836e-05, "loss": 0.4113, "num_tokens": 392607122.0, "step": 620 }, { "epoch": 0.07343029443064916, "grad_norm": 0.20003947615623474, "learning_rate": 1.5447189470943905e-05, "loss": 0.3607, "num_tokens": 393234821.0, "step": 621 }, { "epoch": 0.07354853967127824, "grad_norm": 0.20422472059726715, "learning_rate": 1.536862240544321e-05, "loss": 0.3633, "num_tokens": 393867338.0, "step": 622 }, { "epoch": 0.0736667849119073, "grad_norm": 0.18243864178657532, "learning_rate": 1.5290314768425274e-05, "loss": 0.3752, "num_tokens": 394505968.0, "step": 623 }, { "epoch": 0.07378503015253636, "grad_norm": 0.22229041159152985, "learning_rate": 1.5212267712110427e-05, "loss": 0.4205, "num_tokens": 395143798.0, "step": 624 }, { "epoch": 0.07390327539316542, "grad_norm": 0.19298569858074188, "learning_rate": 1.5134482384884803e-05, "loss": 0.368, "num_tokens": 395781916.0, "step": 625 }, { "epoch": 0.0740215206337945, "grad_norm": 0.20785243809223175, "learning_rate": 1.5056959931283423e-05, "loss": 0.4121, "num_tokens": 396419440.0, "step": 626 }, { "epoch": 0.07413976587442356, "grad_norm": 0.19097504019737244, "learning_rate": 1.4979701491973403e-05, "loss": 0.3539, "num_tokens": 397053137.0, "step": 627 }, { "epoch": 0.07425801111505262, "grad_norm": 0.2216179519891739, "learning_rate": 1.490270820373715e-05, "loss": 0.3927, "num_tokens": 397685003.0, "step": 628 }, { "epoch": 0.07437625635568168, "grad_norm": 0.21564562618732452, "learning_rate": 1.4825981199455601e-05, "loss": 0.4046, "num_tokens": 398323686.0, "step": 629 }, { "epoch": 0.07449450159631076, "grad_norm": 0.20918431878089905, "learning_rate": 1.4749521608091632e-05, "loss": 0.4025, "num_tokens": 398958685.0, "step": 630 }, { "epoch": 0.07461274683693982, "grad_norm": 0.2055424153804779, "learning_rate": 1.4673330554673358e-05, "loss": 0.3961, "num_tokens": 399595823.0, "step": 631 }, { "epoch": 0.07473099207756888, "grad_norm": 0.19133684039115906, "learning_rate": 1.459740916027765e-05, "loss": 0.3868, "num_tokens": 400234363.0, "step": 632 }, { "epoch": 0.07484923731819794, "grad_norm": 0.20725229382514954, "learning_rate": 1.4521758542013575e-05, "loss": 0.3999, "num_tokens": 400860312.0, "step": 633 }, { "epoch": 0.074967482558827, "grad_norm": 0.19468720257282257, "learning_rate": 1.4446379813006028e-05, "loss": 0.3931, "num_tokens": 401493314.0, "step": 634 }, { "epoch": 0.07508572779945608, "grad_norm": 0.21458375453948975, "learning_rate": 1.4371274082379317e-05, "loss": 0.4047, "num_tokens": 402131410.0, "step": 635 }, { "epoch": 0.07520397304008514, "grad_norm": 0.21077150106430054, "learning_rate": 1.4296442455240818e-05, "loss": 0.4181, "num_tokens": 402767694.0, "step": 636 }, { "epoch": 0.0753222182807142, "grad_norm": 0.21079093217849731, "learning_rate": 1.4221886032664769e-05, "loss": 0.4037, "num_tokens": 403401170.0, "step": 637 }, { "epoch": 0.07544046352134326, "grad_norm": 0.1916537582874298, "learning_rate": 1.4147605911676037e-05, "loss": 0.3909, "num_tokens": 404033223.0, "step": 638 }, { "epoch": 0.07555870876197233, "grad_norm": 0.17826271057128906, "learning_rate": 1.4073603185233966e-05, "loss": 0.3837, "num_tokens": 404669142.0, "step": 639 }, { "epoch": 0.0756769540026014, "grad_norm": 0.18769319355487823, "learning_rate": 1.3999878942216336e-05, "loss": 0.3976, "num_tokens": 405305698.0, "step": 640 }, { "epoch": 0.07579519924323046, "grad_norm": 0.21683697402477264, "learning_rate": 1.3926434267403286e-05, "loss": 0.4228, "num_tokens": 405935366.0, "step": 641 }, { "epoch": 0.07591344448385952, "grad_norm": 0.1858586221933365, "learning_rate": 1.3853270241461407e-05, "loss": 0.3949, "num_tokens": 406563939.0, "step": 642 }, { "epoch": 0.0760316897244886, "grad_norm": 0.1963283121585846, "learning_rate": 1.378038794092781e-05, "loss": 0.3806, "num_tokens": 407201876.0, "step": 643 }, { "epoch": 0.07614993496511765, "grad_norm": 0.1992059051990509, "learning_rate": 1.3707788438194276e-05, "loss": 0.3715, "num_tokens": 407834876.0, "step": 644 }, { "epoch": 0.07626818020574672, "grad_norm": 0.19572339951992035, "learning_rate": 1.3635472801491516e-05, "loss": 0.3752, "num_tokens": 408474126.0, "step": 645 }, { "epoch": 0.07638642544637578, "grad_norm": 0.185529887676239, "learning_rate": 1.3563442094873424e-05, "loss": 0.3354, "num_tokens": 409110752.0, "step": 646 }, { "epoch": 0.07650467068700485, "grad_norm": 0.20446783304214478, "learning_rate": 1.349169737820141e-05, "loss": 0.3986, "num_tokens": 409744230.0, "step": 647 }, { "epoch": 0.07662291592763391, "grad_norm": 0.20862102508544922, "learning_rate": 1.3420239707128845e-05, "loss": 0.3885, "num_tokens": 410377730.0, "step": 648 }, { "epoch": 0.07674116116826298, "grad_norm": 0.19482731819152832, "learning_rate": 1.3349070133085478e-05, "loss": 0.369, "num_tokens": 411014041.0, "step": 649 }, { "epoch": 0.07685940640889204, "grad_norm": 0.1799471527338028, "learning_rate": 1.327818970326202e-05, "loss": 0.377, "num_tokens": 411653738.0, "step": 650 }, { "epoch": 0.07697765164952111, "grad_norm": 0.17572778463363647, "learning_rate": 1.3207599460594695e-05, "loss": 0.347, "num_tokens": 412288459.0, "step": 651 }, { "epoch": 0.07709589689015017, "grad_norm": 0.18439733982086182, "learning_rate": 1.31373004437499e-05, "loss": 0.3861, "num_tokens": 412924573.0, "step": 652 }, { "epoch": 0.07721414213077923, "grad_norm": 0.18092259764671326, "learning_rate": 1.3067293687108938e-05, "loss": 0.3428, "num_tokens": 413557882.0, "step": 653 }, { "epoch": 0.0773323873714083, "grad_norm": 0.17916624248027802, "learning_rate": 1.2997580220752791e-05, "loss": 0.3431, "num_tokens": 414190765.0, "step": 654 }, { "epoch": 0.07745063261203737, "grad_norm": 0.18362957239151, "learning_rate": 1.2928161070446937e-05, "loss": 0.3517, "num_tokens": 414824481.0, "step": 655 }, { "epoch": 0.07756887785266643, "grad_norm": 0.18938778340816498, "learning_rate": 1.2859037257626331e-05, "loss": 0.3749, "num_tokens": 415462470.0, "step": 656 }, { "epoch": 0.0776871230932955, "grad_norm": 0.18327617645263672, "learning_rate": 1.2790209799380269e-05, "loss": 0.4054, "num_tokens": 416098823.0, "step": 657 }, { "epoch": 0.07780536833392455, "grad_norm": 0.18833239376544952, "learning_rate": 1.2721679708437516e-05, "loss": 0.3851, "num_tokens": 416727909.0, "step": 658 }, { "epoch": 0.07792361357455363, "grad_norm": 0.21469521522521973, "learning_rate": 1.2653447993151367e-05, "loss": 0.4095, "num_tokens": 417362676.0, "step": 659 }, { "epoch": 0.07804185881518269, "grad_norm": 0.1744416207075119, "learning_rate": 1.2585515657484778e-05, "loss": 0.3625, "num_tokens": 417996258.0, "step": 660 }, { "epoch": 0.07816010405581175, "grad_norm": 0.19375596940517426, "learning_rate": 1.2517883700995673e-05, "loss": 0.4059, "num_tokens": 418626034.0, "step": 661 }, { "epoch": 0.07827834929644081, "grad_norm": 0.19247286021709442, "learning_rate": 1.2450553118822141e-05, "loss": 0.4297, "num_tokens": 419263225.0, "step": 662 }, { "epoch": 0.07839659453706989, "grad_norm": 0.18751764297485352, "learning_rate": 1.238352490166789e-05, "loss": 0.3912, "num_tokens": 419892898.0, "step": 663 }, { "epoch": 0.07851483977769895, "grad_norm": 0.1725941002368927, "learning_rate": 1.2316800035787598e-05, "loss": 0.3779, "num_tokens": 420527528.0, "step": 664 }, { "epoch": 0.07863308501832801, "grad_norm": 0.17689573764801025, "learning_rate": 1.2250379502972414e-05, "loss": 0.3802, "num_tokens": 421156121.0, "step": 665 }, { "epoch": 0.07875133025895707, "grad_norm": 0.17760339379310608, "learning_rate": 1.2184264280535551e-05, "loss": 0.3315, "num_tokens": 421790061.0, "step": 666 }, { "epoch": 0.07886957549958615, "grad_norm": 0.18165592849254608, "learning_rate": 1.2118455341297868e-05, "loss": 0.39, "num_tokens": 422426991.0, "step": 667 }, { "epoch": 0.07898782074021521, "grad_norm": 0.19043633341789246, "learning_rate": 1.2052953653573545e-05, "loss": 0.3475, "num_tokens": 423063834.0, "step": 668 }, { "epoch": 0.07910606598084427, "grad_norm": 0.1863887906074524, "learning_rate": 1.1987760181155897e-05, "loss": 0.3814, "num_tokens": 423703537.0, "step": 669 }, { "epoch": 0.07922431122147333, "grad_norm": 0.19008512794971466, "learning_rate": 1.1922875883303112e-05, "loss": 0.3986, "num_tokens": 424330180.0, "step": 670 }, { "epoch": 0.0793425564621024, "grad_norm": 0.1887669861316681, "learning_rate": 1.1858301714724201e-05, "loss": 0.4111, "num_tokens": 424966976.0, "step": 671 }, { "epoch": 0.07946080170273147, "grad_norm": 0.20428043603897095, "learning_rate": 1.1794038625564926e-05, "loss": 0.3843, "num_tokens": 425604191.0, "step": 672 }, { "epoch": 0.07957904694336053, "grad_norm": 0.20065979659557343, "learning_rate": 1.1730087561393799e-05, "loss": 0.3345, "num_tokens": 426240218.0, "step": 673 }, { "epoch": 0.07969729218398959, "grad_norm": 0.18954698741436005, "learning_rate": 1.1666449463188212e-05, "loss": 0.3979, "num_tokens": 426878525.0, "step": 674 }, { "epoch": 0.07981553742461867, "grad_norm": 0.19051022827625275, "learning_rate": 1.1603125267320565e-05, "loss": 0.3658, "num_tokens": 427512790.0, "step": 675 }, { "epoch": 0.07993378266524773, "grad_norm": 0.21397797763347626, "learning_rate": 1.1540115905544473e-05, "loss": 0.4099, "num_tokens": 428150456.0, "step": 676 }, { "epoch": 0.08005202790587679, "grad_norm": 0.1928778886795044, "learning_rate": 1.1477422304981104e-05, "loss": 0.3455, "num_tokens": 428783253.0, "step": 677 }, { "epoch": 0.08017027314650585, "grad_norm": 0.19302059710025787, "learning_rate": 1.1415045388105477e-05, "loss": 0.3846, "num_tokens": 429419007.0, "step": 678 }, { "epoch": 0.08028851838713492, "grad_norm": 0.190629780292511, "learning_rate": 1.1352986072732943e-05, "loss": 0.3779, "num_tokens": 430051255.0, "step": 679 }, { "epoch": 0.08040676362776399, "grad_norm": 0.22666533291339874, "learning_rate": 1.1291245272005658e-05, "loss": 0.4233, "num_tokens": 430683994.0, "step": 680 }, { "epoch": 0.08052500886839305, "grad_norm": 0.2007281333208084, "learning_rate": 1.1229823894379133e-05, "loss": 0.3534, "num_tokens": 431314161.0, "step": 681 }, { "epoch": 0.08064325410902211, "grad_norm": 0.2027830183506012, "learning_rate": 1.1168722843608897e-05, "loss": 0.3763, "num_tokens": 431943550.0, "step": 682 }, { "epoch": 0.08076149934965117, "grad_norm": 0.18750514090061188, "learning_rate": 1.1107943018737158e-05, "loss": 0.3677, "num_tokens": 432580022.0, "step": 683 }, { "epoch": 0.08087974459028024, "grad_norm": 0.20712460577487946, "learning_rate": 1.104748531407962e-05, "loss": 0.4149, "num_tokens": 433215860.0, "step": 684 }, { "epoch": 0.0809979898309093, "grad_norm": 0.21580064296722412, "learning_rate": 1.0987350619212307e-05, "loss": 0.3697, "num_tokens": 433849766.0, "step": 685 }, { "epoch": 0.08111623507153837, "grad_norm": 0.16628190875053406, "learning_rate": 1.0927539818958437e-05, "loss": 0.348, "num_tokens": 434484743.0, "step": 686 }, { "epoch": 0.08123448031216743, "grad_norm": 0.2060742974281311, "learning_rate": 1.0868053793375467e-05, "loss": 0.3591, "num_tokens": 435100372.0, "step": 687 }, { "epoch": 0.0813527255527965, "grad_norm": 0.2135939598083496, "learning_rate": 1.0808893417742116e-05, "loss": 0.4258, "num_tokens": 435733891.0, "step": 688 }, { "epoch": 0.08147097079342557, "grad_norm": 0.1941777616739273, "learning_rate": 1.0750059562545451e-05, "loss": 0.3644, "num_tokens": 436365690.0, "step": 689 }, { "epoch": 0.08158921603405463, "grad_norm": 0.17885838449001312, "learning_rate": 1.0691553093468144e-05, "loss": 0.3508, "num_tokens": 437003639.0, "step": 690 }, { "epoch": 0.08170746127468369, "grad_norm": 0.18553341925144196, "learning_rate": 1.0633374871375666e-05, "loss": 0.3832, "num_tokens": 437642920.0, "step": 691 }, { "epoch": 0.08182570651531276, "grad_norm": 0.2075071483850479, "learning_rate": 1.0575525752303687e-05, "loss": 0.3829, "num_tokens": 438277063.0, "step": 692 }, { "epoch": 0.08194395175594182, "grad_norm": 0.21307510137557983, "learning_rate": 1.0518006587445431e-05, "loss": 0.3931, "num_tokens": 438915083.0, "step": 693 }, { "epoch": 0.08206219699657089, "grad_norm": 0.20583738386631012, "learning_rate": 1.0460818223139167e-05, "loss": 0.4053, "num_tokens": 439554233.0, "step": 694 }, { "epoch": 0.08218044223719995, "grad_norm": 0.17440171539783478, "learning_rate": 1.0403961500855766e-05, "loss": 0.359, "num_tokens": 440187716.0, "step": 695 }, { "epoch": 0.08229868747782902, "grad_norm": 0.1777043342590332, "learning_rate": 1.0347437257186311e-05, "loss": 0.3862, "num_tokens": 440823462.0, "step": 696 }, { "epoch": 0.08241693271845808, "grad_norm": 0.18520857393741608, "learning_rate": 1.0291246323829772e-05, "loss": 0.3751, "num_tokens": 441461261.0, "step": 697 }, { "epoch": 0.08253517795908714, "grad_norm": 0.2085760086774826, "learning_rate": 1.0235389527580807e-05, "loss": 0.3989, "num_tokens": 442092406.0, "step": 698 }, { "epoch": 0.0826534231997162, "grad_norm": 0.1899712234735489, "learning_rate": 1.0179867690317546e-05, "loss": 0.4033, "num_tokens": 442729228.0, "step": 699 }, { "epoch": 0.08277166844034528, "grad_norm": 0.19925859570503235, "learning_rate": 1.0124681628989546e-05, "loss": 0.416, "num_tokens": 443368453.0, "step": 700 }, { "epoch": 0.08288991368097434, "grad_norm": 0.1773071438074112, "learning_rate": 1.006983215560575e-05, "loss": 0.3633, "num_tokens": 444004993.0, "step": 701 }, { "epoch": 0.0830081589216034, "grad_norm": 0.20045937597751617, "learning_rate": 1.001532007722252e-05, "loss": 0.4294, "num_tokens": 444641198.0, "step": 702 }, { "epoch": 0.08312640416223246, "grad_norm": 0.18577006459236145, "learning_rate": 9.9611461959318e-06, "loss": 0.3833, "num_tokens": 445272456.0, "step": 703 }, { "epoch": 0.08324464940286154, "grad_norm": 0.21089830994606018, "learning_rate": 9.907311308849286e-06, "loss": 0.4268, "num_tokens": 445909612.0, "step": 704 }, { "epoch": 0.0833628946434906, "grad_norm": 0.22879935801029205, "learning_rate": 9.853816208102698e-06, "loss": 0.4456, "num_tokens": 446544323.0, "step": 705 }, { "epoch": 0.08348113988411966, "grad_norm": 0.1861100196838379, "learning_rate": 9.800661680820146e-06, "loss": 0.3963, "num_tokens": 447177697.0, "step": 706 }, { "epoch": 0.08359938512474872, "grad_norm": 0.21287429332733154, "learning_rate": 9.747848509118531e-06, "loss": 0.4048, "num_tokens": 447813578.0, "step": 707 }, { "epoch": 0.0837176303653778, "grad_norm": 0.19029271602630615, "learning_rate": 9.69537747009204e-06, "loss": 0.3696, "num_tokens": 448448247.0, "step": 708 }, { "epoch": 0.08383587560600686, "grad_norm": 0.19157418608665466, "learning_rate": 9.643249335800701e-06, "loss": 0.3907, "num_tokens": 449081260.0, "step": 709 }, { "epoch": 0.08395412084663592, "grad_norm": 0.1897335648536682, "learning_rate": 9.591464873259048e-06, "loss": 0.3519, "num_tokens": 449718960.0, "step": 710 }, { "epoch": 0.08407236608726498, "grad_norm": 0.20246022939682007, "learning_rate": 9.540024844424825e-06, "loss": 0.3647, "num_tokens": 450354221.0, "step": 711 }, { "epoch": 0.08419061132789406, "grad_norm": 0.22009633481502533, "learning_rate": 9.48893000618775e-06, "loss": 0.4162, "num_tokens": 450990864.0, "step": 712 }, { "epoch": 0.08430885656852312, "grad_norm": 0.17846493422985077, "learning_rate": 9.438181110358414e-06, "loss": 0.347, "num_tokens": 451629963.0, "step": 713 }, { "epoch": 0.08442710180915218, "grad_norm": 0.17807744443416595, "learning_rate": 9.387778903657208e-06, "loss": 0.3508, "num_tokens": 452263375.0, "step": 714 }, { "epoch": 0.08454534704978124, "grad_norm": 0.2217930108308792, "learning_rate": 9.337724127703315e-06, "loss": 0.4266, "num_tokens": 452899788.0, "step": 715 }, { "epoch": 0.08466359229041032, "grad_norm": 0.17611801624298096, "learning_rate": 9.288017519003827e-06, "loss": 0.3527, "num_tokens": 453532800.0, "step": 716 }, { "epoch": 0.08478183753103938, "grad_norm": 0.18967418372631073, "learning_rate": 9.2386598089429e-06, "loss": 0.4013, "num_tokens": 454167051.0, "step": 717 }, { "epoch": 0.08490008277166844, "grad_norm": 0.18361113965511322, "learning_rate": 9.189651723770968e-06, "loss": 0.3954, "num_tokens": 454801891.0, "step": 718 }, { "epoch": 0.0850183280122975, "grad_norm": 0.18380604684352875, "learning_rate": 9.140993984594098e-06, "loss": 0.3798, "num_tokens": 455434940.0, "step": 719 }, { "epoch": 0.08513657325292658, "grad_norm": 0.2047707885503769, "learning_rate": 9.092687307363336e-06, "loss": 0.4165, "num_tokens": 456070522.0, "step": 720 }, { "epoch": 0.08525481849355564, "grad_norm": 0.1952807605266571, "learning_rate": 9.044732402864214e-06, "loss": 0.4127, "num_tokens": 456700607.0, "step": 721 }, { "epoch": 0.0853730637341847, "grad_norm": 0.20445430278778076, "learning_rate": 8.997129976706273e-06, "loss": 0.3739, "num_tokens": 457333591.0, "step": 722 }, { "epoch": 0.08549130897481376, "grad_norm": 0.18014107644557953, "learning_rate": 8.949880729312658e-06, "loss": 0.3939, "num_tokens": 457972538.0, "step": 723 }, { "epoch": 0.08560955421544283, "grad_norm": 0.18680702149868011, "learning_rate": 8.902985355909854e-06, "loss": 0.3814, "num_tokens": 458608333.0, "step": 724 }, { "epoch": 0.0857277994560719, "grad_norm": 0.1889398694038391, "learning_rate": 8.856444546517439e-06, "loss": 0.3846, "num_tokens": 459238593.0, "step": 725 }, { "epoch": 0.08584604469670096, "grad_norm": 0.1750420778989792, "learning_rate": 8.810258985937902e-06, "loss": 0.3657, "num_tokens": 459848240.0, "step": 726 }, { "epoch": 0.08596428993733002, "grad_norm": 0.2158506065607071, "learning_rate": 8.764429353746627e-06, "loss": 0.4134, "num_tokens": 460483298.0, "step": 727 }, { "epoch": 0.0860825351779591, "grad_norm": 0.1906134933233261, "learning_rate": 8.71895632428183e-06, "loss": 0.3773, "num_tokens": 461109500.0, "step": 728 }, { "epoch": 0.08620078041858815, "grad_norm": 0.193019300699234, "learning_rate": 8.673840566634688e-06, "loss": 0.3787, "num_tokens": 461746594.0, "step": 729 }, { "epoch": 0.08631902565921722, "grad_norm": 0.18906846642494202, "learning_rate": 8.629082744639463e-06, "loss": 0.3829, "num_tokens": 462380799.0, "step": 730 }, { "epoch": 0.08643727089984628, "grad_norm": 0.18636515736579895, "learning_rate": 8.584683516863736e-06, "loss": 0.3875, "num_tokens": 463016862.0, "step": 731 }, { "epoch": 0.08655551614047535, "grad_norm": 0.17957797646522522, "learning_rate": 8.540643536598749e-06, "loss": 0.3563, "num_tokens": 463650306.0, "step": 732 }, { "epoch": 0.08667376138110441, "grad_norm": 0.181325301527977, "learning_rate": 8.496963451849745e-06, "loss": 0.3773, "num_tokens": 464282371.0, "step": 733 }, { "epoch": 0.08679200662173348, "grad_norm": 0.19648700952529907, "learning_rate": 8.453643905326459e-06, "loss": 0.3687, "num_tokens": 464918493.0, "step": 734 }, { "epoch": 0.08691025186236254, "grad_norm": 0.19785350561141968, "learning_rate": 8.410685534433676e-06, "loss": 0.3761, "num_tokens": 465551682.0, "step": 735 }, { "epoch": 0.0870284971029916, "grad_norm": 0.1787901520729065, "learning_rate": 8.368088971261814e-06, "loss": 0.3737, "num_tokens": 466189560.0, "step": 736 }, { "epoch": 0.08714674234362067, "grad_norm": 0.19740906357765198, "learning_rate": 8.32585484257766e-06, "loss": 0.3781, "num_tokens": 466826010.0, "step": 737 }, { "epoch": 0.08726498758424973, "grad_norm": 0.18967872858047485, "learning_rate": 8.28398376981511e-06, "loss": 0.3477, "num_tokens": 467461700.0, "step": 738 }, { "epoch": 0.0873832328248788, "grad_norm": 0.16891902685165405, "learning_rate": 8.242476369066072e-06, "loss": 0.3352, "num_tokens": 468097256.0, "step": 739 }, { "epoch": 0.08750147806550786, "grad_norm": 0.2073381245136261, "learning_rate": 8.20133325107137e-06, "loss": 0.4052, "num_tokens": 468731535.0, "step": 740 }, { "epoch": 0.08761972330613693, "grad_norm": 0.18397468328475952, "learning_rate": 8.160555021211748e-06, "loss": 0.3544, "num_tokens": 469363357.0, "step": 741 }, { "epoch": 0.087737968546766, "grad_norm": 0.19281727075576782, "learning_rate": 8.12014227949899e-06, "loss": 0.3782, "num_tokens": 469996228.0, "step": 742 }, { "epoch": 0.08785621378739505, "grad_norm": 0.20584794878959656, "learning_rate": 8.080095620567093e-06, "loss": 0.4069, "num_tokens": 470628575.0, "step": 743 }, { "epoch": 0.08797445902802412, "grad_norm": 0.18428972363471985, "learning_rate": 8.040415633663469e-06, "loss": 0.3892, "num_tokens": 471265485.0, "step": 744 }, { "epoch": 0.08809270426865319, "grad_norm": 0.1747061312198639, "learning_rate": 8.001102902640344e-06, "loss": 0.3767, "num_tokens": 471898145.0, "step": 745 }, { "epoch": 0.08821094950928225, "grad_norm": 0.18705062568187714, "learning_rate": 7.962158005946105e-06, "loss": 0.3754, "num_tokens": 472533209.0, "step": 746 }, { "epoch": 0.08832919474991131, "grad_norm": 0.18788328766822815, "learning_rate": 7.923581516616837e-06, "loss": 0.3855, "num_tokens": 473171790.0, "step": 747 }, { "epoch": 0.08844743999054037, "grad_norm": 0.18790322542190552, "learning_rate": 7.88537400226787e-06, "loss": 0.3487, "num_tokens": 473806600.0, "step": 748 }, { "epoch": 0.08856568523116945, "grad_norm": 0.18705305457115173, "learning_rate": 7.847536025085408e-06, "loss": 0.3834, "num_tokens": 474446221.0, "step": 749 }, { "epoch": 0.08868393047179851, "grad_norm": 0.1689257174730301, "learning_rate": 7.810068141818299e-06, "loss": 0.3533, "num_tokens": 475080946.0, "step": 750 }, { "epoch": 0.08880217571242757, "grad_norm": 0.18348811566829681, "learning_rate": 7.772970903769814e-06, "loss": 0.3248, "num_tokens": 475715589.0, "step": 751 }, { "epoch": 0.08892042095305663, "grad_norm": 0.194603830575943, "learning_rate": 7.736244856789531e-06, "loss": 0.3856, "num_tokens": 476350099.0, "step": 752 }, { "epoch": 0.08903866619368571, "grad_norm": 0.19097204506397247, "learning_rate": 7.69989054126533e-06, "loss": 0.3998, "num_tokens": 476986608.0, "step": 753 }, { "epoch": 0.08915691143431477, "grad_norm": 0.18063834309577942, "learning_rate": 7.663908492115426e-06, "loss": 0.3828, "num_tokens": 477626286.0, "step": 754 }, { "epoch": 0.08927515667494383, "grad_norm": 0.1803908348083496, "learning_rate": 7.628299238780476e-06, "loss": 0.3851, "num_tokens": 478262327.0, "step": 755 }, { "epoch": 0.08939340191557289, "grad_norm": 0.18068207800388336, "learning_rate": 7.59306330521584e-06, "loss": 0.3602, "num_tokens": 478899878.0, "step": 756 }, { "epoch": 0.08951164715620197, "grad_norm": 0.1799282282590866, "learning_rate": 7.558201209883818e-06, "loss": 0.3743, "num_tokens": 479538362.0, "step": 757 }, { "epoch": 0.08962989239683103, "grad_norm": 0.1710379421710968, "learning_rate": 7.523713465746072e-06, "loss": 0.3626, "num_tokens": 480177217.0, "step": 758 }, { "epoch": 0.08974813763746009, "grad_norm": 0.18254569172859192, "learning_rate": 7.489600580256027e-06, "loss": 0.3839, "num_tokens": 480809891.0, "step": 759 }, { "epoch": 0.08986638287808915, "grad_norm": 0.19266051054000854, "learning_rate": 7.455863055351445e-06, "loss": 0.3762, "num_tokens": 481446104.0, "step": 760 }, { "epoch": 0.08998462811871823, "grad_norm": 0.16768276691436768, "learning_rate": 7.422501387447021e-06, "loss": 0.3582, "num_tokens": 482084578.0, "step": 761 }, { "epoch": 0.09010287335934729, "grad_norm": 0.18206870555877686, "learning_rate": 7.389516067427073e-06, "loss": 0.3688, "num_tokens": 482713767.0, "step": 762 }, { "epoch": 0.09022111859997635, "grad_norm": 0.21701638400554657, "learning_rate": 7.356907580638336e-06, "loss": 0.436, "num_tokens": 483353280.0, "step": 763 }, { "epoch": 0.09033936384060541, "grad_norm": 0.15846391022205353, "learning_rate": 7.324676406882817e-06, "loss": 0.3657, "num_tokens": 483985107.0, "step": 764 }, { "epoch": 0.09045760908123449, "grad_norm": 0.21575696766376495, "learning_rate": 7.2928230204107194e-06, "loss": 0.3862, "num_tokens": 484615672.0, "step": 765 }, { "epoch": 0.09057585432186355, "grad_norm": 0.19652079045772552, "learning_rate": 7.261347889913485e-06, "loss": 0.3826, "num_tokens": 485253394.0, "step": 766 }, { "epoch": 0.09069409956249261, "grad_norm": 0.1919373869895935, "learning_rate": 7.230251478516881e-06, "loss": 0.3903, "num_tokens": 485886884.0, "step": 767 }, { "epoch": 0.09081234480312167, "grad_norm": 0.21124163269996643, "learning_rate": 7.199534243774199e-06, "loss": 0.3766, "num_tokens": 486516495.0, "step": 768 }, { "epoch": 0.09093059004375074, "grad_norm": 0.18964266777038574, "learning_rate": 7.169196637659522e-06, "loss": 0.4244, "num_tokens": 487151670.0, "step": 769 }, { "epoch": 0.0910488352843798, "grad_norm": 0.20490339398384094, "learning_rate": 7.139239106561053e-06, "loss": 0.3828, "num_tokens": 487786678.0, "step": 770 }, { "epoch": 0.09116708052500887, "grad_norm": 0.20041170716285706, "learning_rate": 7.109662091274574e-06, "loss": 0.3998, "num_tokens": 488423430.0, "step": 771 }, { "epoch": 0.09128532576563793, "grad_norm": 0.17842328548431396, "learning_rate": 7.080466026996954e-06, "loss": 0.3712, "num_tokens": 489055057.0, "step": 772 }, { "epoch": 0.091403571006267, "grad_norm": 0.18228091299533844, "learning_rate": 7.051651343319723e-06, "loss": 0.3632, "num_tokens": 489690318.0, "step": 773 }, { "epoch": 0.09152181624689606, "grad_norm": 0.19914202392101288, "learning_rate": 7.023218464222788e-06, "loss": 0.4109, "num_tokens": 490315503.0, "step": 774 }, { "epoch": 0.09164006148752513, "grad_norm": 0.1682100147008896, "learning_rate": 6.995167808068159e-06, "loss": 0.3356, "num_tokens": 490951658.0, "step": 775 }, { "epoch": 0.09175830672815419, "grad_norm": 0.18745863437652588, "learning_rate": 6.9674997875938175e-06, "loss": 0.3389, "num_tokens": 491582936.0, "step": 776 }, { "epoch": 0.09187655196878326, "grad_norm": 0.18999552726745605, "learning_rate": 6.940214809907637e-06, "loss": 0.4062, "num_tokens": 492221808.0, "step": 777 }, { "epoch": 0.09199479720941232, "grad_norm": 0.20237648487091064, "learning_rate": 6.913313276481378e-06, "loss": 0.3851, "num_tokens": 492851928.0, "step": 778 }, { "epoch": 0.09211304245004139, "grad_norm": 0.18820145726203918, "learning_rate": 6.886795583144813e-06, "loss": 0.3408, "num_tokens": 493484521.0, "step": 779 }, { "epoch": 0.09223128769067045, "grad_norm": 0.2060716450214386, "learning_rate": 6.860662120079868e-06, "loss": 0.4156, "num_tokens": 494120278.0, "step": 780 }, { "epoch": 0.09234953293129952, "grad_norm": 0.1769654005765915, "learning_rate": 6.834913271814898e-06, "loss": 0.4094, "num_tokens": 494748375.0, "step": 781 }, { "epoch": 0.09246777817192858, "grad_norm": 0.2025490701198578, "learning_rate": 6.809549417219036e-06, "loss": 0.3979, "num_tokens": 495383913.0, "step": 782 }, { "epoch": 0.09258602341255764, "grad_norm": 0.1910087913274765, "learning_rate": 6.784570929496596e-06, "loss": 0.3656, "num_tokens": 496008813.0, "step": 783 }, { "epoch": 0.0927042686531867, "grad_norm": 0.18994437158107758, "learning_rate": 6.759978176181609e-06, "loss": 0.3939, "num_tokens": 496639648.0, "step": 784 }, { "epoch": 0.09282251389381578, "grad_norm": 0.1858188956975937, "learning_rate": 6.7357715191323985e-06, "loss": 0.3416, "num_tokens": 497274171.0, "step": 785 }, { "epoch": 0.09294075913444484, "grad_norm": 0.17720621824264526, "learning_rate": 6.711951314526245e-06, "loss": 0.3714, "num_tokens": 497913138.0, "step": 786 }, { "epoch": 0.0930590043750739, "grad_norm": 0.18589583039283752, "learning_rate": 6.688517912854183e-06, "loss": 0.4066, "num_tokens": 498551639.0, "step": 787 }, { "epoch": 0.09317724961570296, "grad_norm": 0.19449126720428467, "learning_rate": 6.665471658915793e-06, "loss": 0.3974, "num_tokens": 499182979.0, "step": 788 }, { "epoch": 0.09329549485633203, "grad_norm": 0.20465314388275146, "learning_rate": 6.642812891814178e-06, "loss": 0.3752, "num_tokens": 499817574.0, "step": 789 }, { "epoch": 0.0934137400969611, "grad_norm": 0.20144398510456085, "learning_rate": 6.620541944950941e-06, "loss": 0.4221, "num_tokens": 500450987.0, "step": 790 }, { "epoch": 0.09353198533759016, "grad_norm": 0.20738765597343445, "learning_rate": 6.598659146021286e-06, "loss": 0.4083, "num_tokens": 501086255.0, "step": 791 }, { "epoch": 0.09365023057821922, "grad_norm": 0.18912115693092346, "learning_rate": 6.577164817009207e-06, "loss": 0.375, "num_tokens": 501724060.0, "step": 792 }, { "epoch": 0.09376847581884828, "grad_norm": 0.17531508207321167, "learning_rate": 6.556059274182744e-06, "loss": 0.3698, "num_tokens": 502336426.0, "step": 793 }, { "epoch": 0.09388672105947736, "grad_norm": 0.2189079225063324, "learning_rate": 6.535342828089317e-06, "loss": 0.4016, "num_tokens": 502970977.0, "step": 794 }, { "epoch": 0.09400496630010642, "grad_norm": 0.20233234763145447, "learning_rate": 6.515015783551183e-06, "loss": 0.332, "num_tokens": 503604914.0, "step": 795 }, { "epoch": 0.09412321154073548, "grad_norm": 0.19407616555690765, "learning_rate": 6.495078439660918e-06, "loss": 0.3673, "num_tokens": 504241729.0, "step": 796 }, { "epoch": 0.09424145678136454, "grad_norm": 0.19280032813549042, "learning_rate": 6.475531089777052e-06, "loss": 0.3671, "num_tokens": 504874311.0, "step": 797 }, { "epoch": 0.09435970202199362, "grad_norm": 0.1831720918416977, "learning_rate": 6.456374021519726e-06, "loss": 0.3887, "num_tokens": 505507864.0, "step": 798 }, { "epoch": 0.09447794726262268, "grad_norm": 0.19168098270893097, "learning_rate": 6.4376075167664654e-06, "loss": 0.3912, "num_tokens": 506141369.0, "step": 799 }, { "epoch": 0.09459619250325174, "grad_norm": 0.18384189903736115, "learning_rate": 6.419231851648044e-06, "loss": 0.3676, "num_tokens": 506777001.0, "step": 800 }, { "epoch": 0.0947144377438808, "grad_norm": 0.16950486600399017, "learning_rate": 6.401247296544408e-06, "loss": 0.3298, "num_tokens": 507406711.0, "step": 801 }, { "epoch": 0.09483268298450988, "grad_norm": 0.1815839558839798, "learning_rate": 6.383654116080699e-06, "loss": 0.3838, "num_tokens": 508042267.0, "step": 802 }, { "epoch": 0.09495092822513894, "grad_norm": 0.18775033950805664, "learning_rate": 6.366452569123366e-06, "loss": 0.3549, "num_tokens": 508675609.0, "step": 803 }, { "epoch": 0.095069173465768, "grad_norm": 0.17260177433490753, "learning_rate": 6.3496429087763535e-06, "loss": 0.3564, "num_tokens": 509312065.0, "step": 804 }, { "epoch": 0.09518741870639706, "grad_norm": 0.1801680028438568, "learning_rate": 6.333225382377383e-06, "loss": 0.3679, "num_tokens": 509946717.0, "step": 805 }, { "epoch": 0.09530566394702614, "grad_norm": 0.1752161681652069, "learning_rate": 6.3172002314943e-06, "loss": 0.3705, "num_tokens": 510583518.0, "step": 806 }, { "epoch": 0.0954239091876552, "grad_norm": 0.19134798645973206, "learning_rate": 6.30156769192153e-06, "loss": 0.3984, "num_tokens": 511221107.0, "step": 807 }, { "epoch": 0.09554215442828426, "grad_norm": 0.1874755322933197, "learning_rate": 6.286327993676615e-06, "loss": 0.3846, "num_tokens": 511860697.0, "step": 808 }, { "epoch": 0.09566039966891332, "grad_norm": 0.17865346372127533, "learning_rate": 6.271481360996808e-06, "loss": 0.3737, "num_tokens": 512498128.0, "step": 809 }, { "epoch": 0.0957786449095424, "grad_norm": 0.19846026599407196, "learning_rate": 6.257028012335795e-06, "loss": 0.4089, "num_tokens": 513128610.0, "step": 810 }, { "epoch": 0.09589689015017146, "grad_norm": 0.16993194818496704, "learning_rate": 6.2429681603604726e-06, "loss": 0.3392, "num_tokens": 513765105.0, "step": 811 }, { "epoch": 0.09601513539080052, "grad_norm": 0.17878930270671844, "learning_rate": 6.229302011947814e-06, "loss": 0.3964, "num_tokens": 514394034.0, "step": 812 }, { "epoch": 0.09613338063142958, "grad_norm": 0.18822607398033142, "learning_rate": 6.2160297681818316e-06, "loss": 0.3763, "num_tokens": 515033384.0, "step": 813 }, { "epoch": 0.09625162587205865, "grad_norm": 0.19209401309490204, "learning_rate": 6.2031516243506175e-06, "loss": 0.3585, "num_tokens": 515667789.0, "step": 814 }, { "epoch": 0.09636987111268772, "grad_norm": 0.19187025725841522, "learning_rate": 6.190667769943463e-06, "loss": 0.3625, "num_tokens": 516301878.0, "step": 815 }, { "epoch": 0.09648811635331678, "grad_norm": 0.17314016819000244, "learning_rate": 6.178578388648084e-06, "loss": 0.3548, "num_tokens": 516936923.0, "step": 816 }, { "epoch": 0.09660636159394584, "grad_norm": 0.19279181957244873, "learning_rate": 6.166883658347904e-06, "loss": 0.4, "num_tokens": 517574893.0, "step": 817 }, { "epoch": 0.09672460683457491, "grad_norm": 0.1702749878168106, "learning_rate": 6.155583751119448e-06, "loss": 0.3694, "num_tokens": 518213624.0, "step": 818 }, { "epoch": 0.09684285207520398, "grad_norm": 0.1792595386505127, "learning_rate": 6.1446788332298e-06, "loss": 0.3538, "num_tokens": 518852531.0, "step": 819 }, { "epoch": 0.09696109731583304, "grad_norm": 0.18162083625793457, "learning_rate": 6.134169065134162e-06, "loss": 0.3896, "num_tokens": 519492204.0, "step": 820 }, { "epoch": 0.0970793425564621, "grad_norm": 0.18663813173770905, "learning_rate": 6.124054601473502e-06, "loss": 0.3965, "num_tokens": 520130296.0, "step": 821 }, { "epoch": 0.09719758779709117, "grad_norm": 0.1922474354505539, "learning_rate": 6.114335591072261e-06, "loss": 0.3621, "num_tokens": 520765986.0, "step": 822 }, { "epoch": 0.09731583303772023, "grad_norm": 0.236387699842453, "learning_rate": 6.105012176936177e-06, "loss": 0.4225, "num_tokens": 521400644.0, "step": 823 }, { "epoch": 0.0974340782783493, "grad_norm": 0.17774070799350739, "learning_rate": 6.096084496250168e-06, "loss": 0.364, "num_tokens": 522039463.0, "step": 824 }, { "epoch": 0.09755232351897836, "grad_norm": 0.18863226473331451, "learning_rate": 6.087552680376332e-06, "loss": 0.3668, "num_tokens": 522671508.0, "step": 825 }, { "epoch": 0.09767056875960743, "grad_norm": 0.19288307428359985, "learning_rate": 6.079416854851993e-06, "loss": 0.3596, "num_tokens": 523311225.0, "step": 826 }, { "epoch": 0.0977888140002365, "grad_norm": 0.1993461400270462, "learning_rate": 6.071677139387874e-06, "loss": 0.3414, "num_tokens": 523949133.0, "step": 827 }, { "epoch": 0.09790705924086555, "grad_norm": 0.18140719830989838, "learning_rate": 6.064333647866317e-06, "loss": 0.3793, "num_tokens": 524577955.0, "step": 828 }, { "epoch": 0.09802530448149462, "grad_norm": 0.18989813327789307, "learning_rate": 6.057386488339618e-06, "loss": 0.3784, "num_tokens": 525211514.0, "step": 829 }, { "epoch": 0.09814354972212369, "grad_norm": 0.18679462373256683, "learning_rate": 6.050835763028446e-06, "loss": 0.4006, "num_tokens": 525848086.0, "step": 830 }, { "epoch": 0.09826179496275275, "grad_norm": 0.17804615199565887, "learning_rate": 6.04468156832031e-06, "loss": 0.3619, "num_tokens": 526481722.0, "step": 831 }, { "epoch": 0.09838004020338181, "grad_norm": 0.1832081377506256, "learning_rate": 6.038923994768173e-06, "loss": 0.3818, "num_tokens": 527117956.0, "step": 832 }, { "epoch": 0.09849828544401087, "grad_norm": 0.20609410107135773, "learning_rate": 6.033563127089097e-06, "loss": 0.4023, "num_tokens": 527750234.0, "step": 833 }, { "epoch": 0.09861653068463995, "grad_norm": 0.201175257563591, "learning_rate": 6.02859904416301e-06, "loss": 0.3745, "num_tokens": 528386561.0, "step": 834 }, { "epoch": 0.09873477592526901, "grad_norm": 0.20368118584156036, "learning_rate": 6.024031819031541e-06, "loss": 0.4117, "num_tokens": 529021750.0, "step": 835 }, { "epoch": 0.09885302116589807, "grad_norm": 0.18870466947555542, "learning_rate": 6.019861518896941e-06, "loss": 0.3533, "num_tokens": 529661276.0, "step": 836 }, { "epoch": 0.09897126640652713, "grad_norm": 0.2020527869462967, "learning_rate": 6.016088205121099e-06, "loss": 0.3947, "num_tokens": 530297609.0, "step": 837 }, { "epoch": 0.09908951164715621, "grad_norm": 0.18172025680541992, "learning_rate": 6.012711933224636e-06, "loss": 0.3672, "num_tokens": 530933315.0, "step": 838 }, { "epoch": 0.09920775688778527, "grad_norm": 0.1858338862657547, "learning_rate": 6.009732752886096e-06, "loss": 0.381, "num_tokens": 531564788.0, "step": 839 }, { "epoch": 0.09932600212841433, "grad_norm": 0.18906207382678986, "learning_rate": 6.0071507079412e-06, "loss": 0.384, "num_tokens": 532193430.0, "step": 840 }, { "epoch": 0.09944424736904339, "grad_norm": 0.1974787414073944, "learning_rate": 6.004965836382215e-06, "loss": 0.3912, "num_tokens": 532828601.0, "step": 841 }, { "epoch": 0.09956249260967245, "grad_norm": 0.18472707271575928, "learning_rate": 6.003178170357397e-06, "loss": 0.3508, "num_tokens": 533466099.0, "step": 842 }, { "epoch": 0.09968073785030153, "grad_norm": 0.17779730260372162, "learning_rate": 6.001787736170496e-06, "loss": 0.3865, "num_tokens": 534102611.0, "step": 843 }, { "epoch": 0.09979898309093059, "grad_norm": 0.19053196907043457, "learning_rate": 6.000794554280395e-06, "loss": 0.3733, "num_tokens": 534731488.0, "step": 844 }, { "epoch": 0.09991722833155965, "grad_norm": 0.1939527839422226, "learning_rate": 6.0001986393007945e-06, "loss": 0.3785, "num_tokens": 535370116.0, "step": 845 } ], "logging_steps": 1.0, "max_steps": 845, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 845, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.527336137257124e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }