| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.09991722833155965, |
| "eval_steps": 500, |
| "global_step": 845, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00011824524062906468, |
| "grad_norm": 1.7523608207702637, |
| "learning_rate": 0.0, |
| "loss": 0.9062, |
| "num_tokens": 628048.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00023649048125812936, |
| "grad_norm": 1.6680941581726074, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 0.8713, |
| "num_tokens": 1266689.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.000354735721887194, |
| "grad_norm": 1.625159502029419, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.873, |
| "num_tokens": 1900338.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0004729809625162587, |
| "grad_norm": 1.3783349990844727, |
| "learning_rate": 6.923076923076923e-06, |
| "loss": 0.8628, |
| "num_tokens": 2539095.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0005912262031453234, |
| "grad_norm": 1.1593743562698364, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 0.8533, |
| "num_tokens": 3172740.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.000709471443774388, |
| "grad_norm": 1.1372928619384766, |
| "learning_rate": 1.153846153846154e-05, |
| "loss": 0.8015, |
| "num_tokens": 3810090.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0008277166844034528, |
| "grad_norm": 1.4683241844177246, |
| "learning_rate": 1.3846153846153847e-05, |
| "loss": 0.8613, |
| "num_tokens": 4447913.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0009459619250325174, |
| "grad_norm": 1.2824925184249878, |
| "learning_rate": 1.6153846153846154e-05, |
| "loss": 0.7549, |
| "num_tokens": 5085253.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.001064207165661582, |
| "grad_norm": 0.9277980923652649, |
| "learning_rate": 1.8461538461538465e-05, |
| "loss": 0.7863, |
| "num_tokens": 5721930.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0011824524062906468, |
| "grad_norm": 1.0162955522537231, |
| "learning_rate": 2.076923076923077e-05, |
| "loss": 0.755, |
| "num_tokens": 6354428.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0013006976469197116, |
| "grad_norm": 0.8888687491416931, |
| "learning_rate": 2.307692307692308e-05, |
| "loss": 0.7388, |
| "num_tokens": 6988898.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.001418942887548776, |
| "grad_norm": 0.722545862197876, |
| "learning_rate": 2.5384615384615386e-05, |
| "loss": 0.7017, |
| "num_tokens": 7627058.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0015371881281778408, |
| "grad_norm": 0.7729371786117554, |
| "learning_rate": 2.7692307692307694e-05, |
| "loss": 0.6836, |
| "num_tokens": 8261730.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0016554333688069056, |
| "grad_norm": 0.6504688858985901, |
| "learning_rate": 3e-05, |
| "loss": 0.6866, |
| "num_tokens": 8889249.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0017736786094359701, |
| "grad_norm": 0.6326490640640259, |
| "learning_rate": 3.230769230769231e-05, |
| "loss": 0.6229, |
| "num_tokens": 9524710.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0018919238500650349, |
| "grad_norm": 0.6311523914337158, |
| "learning_rate": 3.461538461538461e-05, |
| "loss": 0.6436, |
| "num_tokens": 10161723.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0020101690906940994, |
| "grad_norm": 2.1880621910095215, |
| "learning_rate": 3.692307692307693e-05, |
| "loss": 0.6368, |
| "num_tokens": 10793622.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.002128414331323164, |
| "grad_norm": 0.766629695892334, |
| "learning_rate": 3.923076923076923e-05, |
| "loss": 0.64, |
| "num_tokens": 11420435.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002246659571952229, |
| "grad_norm": 0.6217833161354065, |
| "learning_rate": 4.153846153846154e-05, |
| "loss": 0.6326, |
| "num_tokens": 12055174.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0023649048125812936, |
| "grad_norm": 0.5231024026870728, |
| "learning_rate": 4.384615384615385e-05, |
| "loss": 0.5653, |
| "num_tokens": 12687424.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0024831500532103584, |
| "grad_norm": 0.5611249804496765, |
| "learning_rate": 4.615384615384616e-05, |
| "loss": 0.606, |
| "num_tokens": 13318953.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.002601395293839423, |
| "grad_norm": 0.5640860199928284, |
| "learning_rate": 4.846153846153846e-05, |
| "loss": 0.5724, |
| "num_tokens": 13937259.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0027196405344684874, |
| "grad_norm": 0.48454275727272034, |
| "learning_rate": 5.076923076923077e-05, |
| "loss": 0.5994, |
| "num_tokens": 14569053.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.002837885775097552, |
| "grad_norm": 0.6201558113098145, |
| "learning_rate": 5.3076923076923076e-05, |
| "loss": 0.6035, |
| "num_tokens": 15199901.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.002956131015726617, |
| "grad_norm": 0.773175060749054, |
| "learning_rate": 5.538461538461539e-05, |
| "loss": 0.5975, |
| "num_tokens": 15830276.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0030743762563556817, |
| "grad_norm": 0.5781369209289551, |
| "learning_rate": 5.76923076923077e-05, |
| "loss": 0.5864, |
| "num_tokens": 16465223.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.0031926214969847464, |
| "grad_norm": 0.5451337695121765, |
| "learning_rate": 6e-05, |
| "loss": 0.5692, |
| "num_tokens": 17102309.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.003310866737613811, |
| "grad_norm": 0.48841163516044617, |
| "learning_rate": 5.9999801360699206e-05, |
| "loss": 0.5736, |
| "num_tokens": 17738787.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.003429111978242876, |
| "grad_norm": 0.43556851148605347, |
| "learning_rate": 5.9999205445719606e-05, |
| "loss": 0.5085, |
| "num_tokens": 18370503.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.0035473572188719402, |
| "grad_norm": 0.4315873682498932, |
| "learning_rate": 5.999821226382951e-05, |
| "loss": 0.5342, |
| "num_tokens": 19008700.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003665602459501005, |
| "grad_norm": 0.48499879240989685, |
| "learning_rate": 5.99968218296426e-05, |
| "loss": 0.5414, |
| "num_tokens": 19641076.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0037838477001300697, |
| "grad_norm": 0.3707197308540344, |
| "learning_rate": 5.999503416361778e-05, |
| "loss": 0.4694, |
| "num_tokens": 20268470.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.0039020929407591345, |
| "grad_norm": 0.4602040946483612, |
| "learning_rate": 5.99928492920588e-05, |
| "loss": 0.545, |
| "num_tokens": 20903791.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.004020338181388199, |
| "grad_norm": 0.4377839267253876, |
| "learning_rate": 5.999026724711391e-05, |
| "loss": 0.5273, |
| "num_tokens": 21537889.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.0041385834220172635, |
| "grad_norm": 0.3558352291584015, |
| "learning_rate": 5.998728806677537e-05, |
| "loss": 0.4575, |
| "num_tokens": 22169163.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.004256828662646328, |
| "grad_norm": 0.4064357280731201, |
| "learning_rate": 5.99839117948789e-05, |
| "loss": 0.5139, |
| "num_tokens": 22802993.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004375073903275393, |
| "grad_norm": 0.40676349401474, |
| "learning_rate": 5.998013848110306e-05, |
| "loss": 0.4923, |
| "num_tokens": 23436910.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.004493319143904458, |
| "grad_norm": 0.4407147765159607, |
| "learning_rate": 5.997596818096846e-05, |
| "loss": 0.5295, |
| "num_tokens": 24066237.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0046115643845335225, |
| "grad_norm": 0.42400693893432617, |
| "learning_rate": 5.997140095583699e-05, |
| "loss": 0.4883, |
| "num_tokens": 24702070.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.004729809625162587, |
| "grad_norm": 0.36618033051490784, |
| "learning_rate": 5.99664368729109e-05, |
| "loss": 0.4745, |
| "num_tokens": 25335554.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.004848054865791652, |
| "grad_norm": 0.32382911443710327, |
| "learning_rate": 5.996107600523183e-05, |
| "loss": 0.4362, |
| "num_tokens": 25967347.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.004966300106420717, |
| "grad_norm": 0.39235326647758484, |
| "learning_rate": 5.995531843167969e-05, |
| "loss": 0.4558, |
| "num_tokens": 26599914.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0050845453470497815, |
| "grad_norm": 0.3560352921485901, |
| "learning_rate": 5.9949164236971555e-05, |
| "loss": 0.5103, |
| "num_tokens": 27231412.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.005202790587678846, |
| "grad_norm": 0.4623904526233673, |
| "learning_rate": 5.994261351166038e-05, |
| "loss": 0.5168, |
| "num_tokens": 27868949.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.005321035828307911, |
| "grad_norm": 0.3559505343437195, |
| "learning_rate": 5.99356663521337e-05, |
| "loss": 0.4952, |
| "num_tokens": 28505673.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.005439281068936975, |
| "grad_norm": 0.4067099690437317, |
| "learning_rate": 5.9928322860612126e-05, |
| "loss": 0.5023, |
| "num_tokens": 29143956.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.00555752630956604, |
| "grad_norm": 0.40066882967948914, |
| "learning_rate": 5.992058314514801e-05, |
| "loss": 0.5023, |
| "num_tokens": 29780377.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.005675771550195104, |
| "grad_norm": 0.3772350251674652, |
| "learning_rate": 5.9912447319623676e-05, |
| "loss": 0.5043, |
| "num_tokens": 30419988.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.005794016790824169, |
| "grad_norm": 0.36861610412597656, |
| "learning_rate": 5.9903915503749835e-05, |
| "loss": 0.4962, |
| "num_tokens": 31058641.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.005912262031453234, |
| "grad_norm": 0.36637431383132935, |
| "learning_rate": 5.989498782306382e-05, |
| "loss": 0.4995, |
| "num_tokens": 31696113.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006030507272082299, |
| "grad_norm": 0.432847797870636, |
| "learning_rate": 5.9885664408927744e-05, |
| "loss": 0.5389, |
| "num_tokens": 32335026.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.006148752512711363, |
| "grad_norm": 0.37941452860832214, |
| "learning_rate": 5.98759453985265e-05, |
| "loss": 0.4635, |
| "num_tokens": 32968147.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.006266997753340428, |
| "grad_norm": 0.4316914677619934, |
| "learning_rate": 5.9865830934865846e-05, |
| "loss": 0.5001, |
| "num_tokens": 33599687.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.006385242993969493, |
| "grad_norm": 0.36900049448013306, |
| "learning_rate": 5.98553211667702e-05, |
| "loss": 0.4727, |
| "num_tokens": 34235952.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.006503488234598558, |
| "grad_norm": 0.39060965180397034, |
| "learning_rate": 5.9844416248880556e-05, |
| "loss": 0.4985, |
| "num_tokens": 34855614.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.006621733475227622, |
| "grad_norm": 0.4554467499256134, |
| "learning_rate": 5.983311634165209e-05, |
| "loss": 0.5408, |
| "num_tokens": 35490773.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.006739978715856687, |
| "grad_norm": 0.3941882252693176, |
| "learning_rate": 5.982142161135191e-05, |
| "loss": 0.5216, |
| "num_tokens": 36118336.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.006858223956485752, |
| "grad_norm": 0.39384809136390686, |
| "learning_rate": 5.9809332230056545e-05, |
| "loss": 0.4929, |
| "num_tokens": 36751911.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.0069764691971148166, |
| "grad_norm": 0.38750314712524414, |
| "learning_rate": 5.979684837564939e-05, |
| "loss": 0.4889, |
| "num_tokens": 37375413.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0070947144377438804, |
| "grad_norm": 0.387765496969223, |
| "learning_rate": 5.978397023181817e-05, |
| "loss": 0.4611, |
| "num_tokens": 38006888.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007212959678372945, |
| "grad_norm": 0.36486542224884033, |
| "learning_rate": 5.977069798805219e-05, |
| "loss": 0.4789, |
| "num_tokens": 38640497.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00733120491900201, |
| "grad_norm": 0.32066309452056885, |
| "learning_rate": 5.975703183963953e-05, |
| "loss": 0.4666, |
| "num_tokens": 39273313.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.007449450159631075, |
| "grad_norm": 0.4485960304737091, |
| "learning_rate": 5.97429719876642e-05, |
| "loss": 0.5988, |
| "num_tokens": 39910620.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.007567695400260139, |
| "grad_norm": 0.45505833625793457, |
| "learning_rate": 5.97285186390032e-05, |
| "loss": 0.4779, |
| "num_tokens": 40546868.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.007685940640889204, |
| "grad_norm": 0.3256620168685913, |
| "learning_rate": 5.9713672006323386e-05, |
| "loss": 0.4478, |
| "num_tokens": 41182518.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.007804185881518269, |
| "grad_norm": 0.4429851770401001, |
| "learning_rate": 5.969843230807847e-05, |
| "loss": 0.4945, |
| "num_tokens": 41817083.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.007922431122147333, |
| "grad_norm": 0.4284612536430359, |
| "learning_rate": 5.96827997685057e-05, |
| "loss": 0.4906, |
| "num_tokens": 42454047.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.008040676362776398, |
| "grad_norm": 0.41103261709213257, |
| "learning_rate": 5.966677461762262e-05, |
| "loss": 0.4699, |
| "num_tokens": 43066339.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.008158921603405462, |
| "grad_norm": 0.41535407304763794, |
| "learning_rate": 5.965035709122364e-05, |
| "loss": 0.516, |
| "num_tokens": 43703254.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.008277166844034527, |
| "grad_norm": 0.41889598965644836, |
| "learning_rate": 5.963354743087664e-05, |
| "loss": 0.5109, |
| "num_tokens": 44339105.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008395412084663592, |
| "grad_norm": 0.37383216619491577, |
| "learning_rate": 5.9616345883919304e-05, |
| "loss": 0.4497, |
| "num_tokens": 44969251.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.008513657325292657, |
| "grad_norm": 0.39339086413383484, |
| "learning_rate": 5.9598752703455596e-05, |
| "loss": 0.4967, |
| "num_tokens": 45605957.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.008631902565921721, |
| "grad_norm": 0.34155574440956116, |
| "learning_rate": 5.958076814835196e-05, |
| "loss": 0.4478, |
| "num_tokens": 46242216.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.008750147806550786, |
| "grad_norm": 0.40994498133659363, |
| "learning_rate": 5.956239248323354e-05, |
| "loss": 0.4974, |
| "num_tokens": 46879736.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.00886839304717985, |
| "grad_norm": 0.34924760460853577, |
| "learning_rate": 5.9543625978480276e-05, |
| "loss": 0.4551, |
| "num_tokens": 47508876.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.008986638287808916, |
| "grad_norm": 0.3592020869255066, |
| "learning_rate": 5.952446891022294e-05, |
| "loss": 0.4589, |
| "num_tokens": 48148110.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.00910488352843798, |
| "grad_norm": 0.3335554003715515, |
| "learning_rate": 5.9504921560339085e-05, |
| "loss": 0.4415, |
| "num_tokens": 48779111.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.009223128769067045, |
| "grad_norm": 0.33642348647117615, |
| "learning_rate": 5.948498421644883e-05, |
| "loss": 0.4479, |
| "num_tokens": 49414520.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.00934137400969611, |
| "grad_norm": 0.3461398184299469, |
| "learning_rate": 5.9464657171910686e-05, |
| "loss": 0.4697, |
| "num_tokens": 50047364.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.009459619250325174, |
| "grad_norm": 0.35207217931747437, |
| "learning_rate": 5.944394072581726e-05, |
| "loss": 0.4365, |
| "num_tokens": 50679909.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00957786449095424, |
| "grad_norm": 0.3526061177253723, |
| "learning_rate": 5.9422835182990794e-05, |
| "loss": 0.447, |
| "num_tokens": 51313449.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.009696109731583304, |
| "grad_norm": 0.3391474783420563, |
| "learning_rate": 5.940134085397872e-05, |
| "loss": 0.4642, |
| "num_tokens": 51949695.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.009814354972212369, |
| "grad_norm": 0.3502749800682068, |
| "learning_rate": 5.937945805504906e-05, |
| "loss": 0.4723, |
| "num_tokens": 52582348.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.009932600212841433, |
| "grad_norm": 0.35535070300102234, |
| "learning_rate": 5.9357187108185826e-05, |
| "loss": 0.4752, |
| "num_tokens": 53211571.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.010050845453470498, |
| "grad_norm": 0.3651171922683716, |
| "learning_rate": 5.933452834108421e-05, |
| "loss": 0.4694, |
| "num_tokens": 53846179.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.010169090694099563, |
| "grad_norm": 0.32200363278388977, |
| "learning_rate": 5.931148208714582e-05, |
| "loss": 0.4597, |
| "num_tokens": 54478244.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.010287335934728628, |
| "grad_norm": 0.35413858294487, |
| "learning_rate": 5.9288048685473756e-05, |
| "loss": 0.4795, |
| "num_tokens": 55113336.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.010405581175357692, |
| "grad_norm": 0.28715524077415466, |
| "learning_rate": 5.92642284808676e-05, |
| "loss": 0.4432, |
| "num_tokens": 55750901.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.010523826415986757, |
| "grad_norm": 0.3725243806838989, |
| "learning_rate": 5.924002182381839e-05, |
| "loss": 0.5214, |
| "num_tokens": 56387320.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.010642071656615822, |
| "grad_norm": 0.3085726499557495, |
| "learning_rate": 5.9215429070503406e-05, |
| "loss": 0.4465, |
| "num_tokens": 57023503.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.010760316897244887, |
| "grad_norm": 0.3731476366519928, |
| "learning_rate": 5.9190450582780974e-05, |
| "loss": 0.5066, |
| "num_tokens": 57651196.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.01087856213787395, |
| "grad_norm": 0.32896849513053894, |
| "learning_rate": 5.9165086728185106e-05, |
| "loss": 0.4651, |
| "num_tokens": 58290170.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.010996807378503014, |
| "grad_norm": 0.29874181747436523, |
| "learning_rate": 5.913933787992013e-05, |
| "loss": 0.4323, |
| "num_tokens": 58929585.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.01111505261913208, |
| "grad_norm": 0.3025204539299011, |
| "learning_rate": 5.9113204416855196e-05, |
| "loss": 0.4362, |
| "num_tokens": 59569034.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.011233297859761144, |
| "grad_norm": 0.3040831685066223, |
| "learning_rate": 5.908668672351862e-05, |
| "loss": 0.4681, |
| "num_tokens": 60197509.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.011351543100390209, |
| "grad_norm": 0.33227190375328064, |
| "learning_rate": 5.9059785190092366e-05, |
| "loss": 0.4445, |
| "num_tokens": 60830564.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.011469788341019273, |
| "grad_norm": 0.3173273503780365, |
| "learning_rate": 5.9032500212406184e-05, |
| "loss": 0.4706, |
| "num_tokens": 61466570.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.011588033581648338, |
| "grad_norm": 0.3499050438404083, |
| "learning_rate": 5.900483219193184e-05, |
| "loss": 0.4474, |
| "num_tokens": 62100797.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.011706278822277403, |
| "grad_norm": 0.29081398248672485, |
| "learning_rate": 5.8976781535777215e-05, |
| "loss": 0.4548, |
| "num_tokens": 62734274.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.011824524062906468, |
| "grad_norm": 0.3550204634666443, |
| "learning_rate": 5.894834865668028e-05, |
| "loss": 0.4781, |
| "num_tokens": 63350637.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.011942769303535532, |
| "grad_norm": 0.3122808635234833, |
| "learning_rate": 5.891953397300305e-05, |
| "loss": 0.4562, |
| "num_tokens": 63989248.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.012061014544164597, |
| "grad_norm": 0.3456708490848541, |
| "learning_rate": 5.889033790872542e-05, |
| "loss": 0.4657, |
| "num_tokens": 64623402.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.012179259784793662, |
| "grad_norm": 0.30247852206230164, |
| "learning_rate": 5.886076089343895e-05, |
| "loss": 0.42, |
| "num_tokens": 65263084.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.012297505025422727, |
| "grad_norm": 0.34775105118751526, |
| "learning_rate": 5.883080336234049e-05, |
| "loss": 0.4833, |
| "num_tokens": 65895544.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.012415750266051791, |
| "grad_norm": 0.35499584674835205, |
| "learning_rate": 5.88004657562258e-05, |
| "loss": 0.4397, |
| "num_tokens": 66526292.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.012533995506680856, |
| "grad_norm": 0.29378530383110046, |
| "learning_rate": 5.876974852148312e-05, |
| "loss": 0.455, |
| "num_tokens": 67163008.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.012652240747309921, |
| "grad_norm": 0.32384178042411804, |
| "learning_rate": 5.873865211008652e-05, |
| "loss": 0.45, |
| "num_tokens": 67799173.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.012770485987938986, |
| "grad_norm": 0.3031487762928009, |
| "learning_rate": 5.870717697958928e-05, |
| "loss": 0.431, |
| "num_tokens": 68433626.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.01288873122856805, |
| "grad_norm": 0.3422238230705261, |
| "learning_rate": 5.867532359311718e-05, |
| "loss": 0.462, |
| "num_tokens": 69071597.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.013006976469197115, |
| "grad_norm": 0.36208781599998474, |
| "learning_rate": 5.864309241936167e-05, |
| "loss": 0.4841, |
| "num_tokens": 69708272.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.01312522170982618, |
| "grad_norm": 0.35731053352355957, |
| "learning_rate": 5.861048393257293e-05, |
| "loss": 0.4707, |
| "num_tokens": 70309426.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.013243466950455245, |
| "grad_norm": 0.42830735445022583, |
| "learning_rate": 5.8577498612552985e-05, |
| "loss": 0.4905, |
| "num_tokens": 70946347.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.01336171219108431, |
| "grad_norm": 0.33078286051750183, |
| "learning_rate": 5.8544136944648554e-05, |
| "loss": 0.4294, |
| "num_tokens": 71578069.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.013479957431713374, |
| "grad_norm": 0.31700757145881653, |
| "learning_rate": 5.851039941974397e-05, |
| "loss": 0.4321, |
| "num_tokens": 72216733.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.013598202672342439, |
| "grad_norm": 0.3752131462097168, |
| "learning_rate": 5.8476286534253925e-05, |
| "loss": 0.4585, |
| "num_tokens": 72844928.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.013716447912971504, |
| "grad_norm": 0.29715201258659363, |
| "learning_rate": 5.844179879011618e-05, |
| "loss": 0.4574, |
| "num_tokens": 73482837.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.013834693153600568, |
| "grad_norm": 0.30245885252952576, |
| "learning_rate": 5.8406936694784165e-05, |
| "loss": 0.4828, |
| "num_tokens": 74118196.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.013952938394229633, |
| "grad_norm": 0.29638686776161194, |
| "learning_rate": 5.8371700761219527e-05, |
| "loss": 0.4263, |
| "num_tokens": 74756174.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.014071183634858696, |
| "grad_norm": 0.3421514928340912, |
| "learning_rate": 5.833609150788458e-05, |
| "loss": 0.4882, |
| "num_tokens": 75393367.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.014189428875487761, |
| "grad_norm": 0.3114563226699829, |
| "learning_rate": 5.830010945873467e-05, |
| "loss": 0.4346, |
| "num_tokens": 76025875.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.014307674116116826, |
| "grad_norm": 0.29460081458091736, |
| "learning_rate": 5.826375514321047e-05, |
| "loss": 0.4155, |
| "num_tokens": 76657710.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.01442591935674589, |
| "grad_norm": 0.34313178062438965, |
| "learning_rate": 5.8227029096230196e-05, |
| "loss": 0.4563, |
| "num_tokens": 77289318.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.014544164597374955, |
| "grad_norm": 0.3677009046077728, |
| "learning_rate": 5.81899318581817e-05, |
| "loss": 0.4417, |
| "num_tokens": 77923482.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.01466240983800402, |
| "grad_norm": 0.3275640606880188, |
| "learning_rate": 5.8152463974914595e-05, |
| "loss": 0.4607, |
| "num_tokens": 78551959.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.014780655078633085, |
| "grad_norm": 0.37022823095321655, |
| "learning_rate": 5.811462599773214e-05, |
| "loss": 0.4506, |
| "num_tokens": 79181459.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01489890031926215, |
| "grad_norm": 0.31386008858680725, |
| "learning_rate": 5.807641848338316e-05, |
| "loss": 0.4194, |
| "num_tokens": 79816398.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.015017145559891214, |
| "grad_norm": 0.3229714632034302, |
| "learning_rate": 5.80378419940539e-05, |
| "loss": 0.4543, |
| "num_tokens": 80451198.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.015135390800520279, |
| "grad_norm": 0.33021923899650574, |
| "learning_rate": 5.799889709735966e-05, |
| "loss": 0.4601, |
| "num_tokens": 81087693.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.015253636041149344, |
| "grad_norm": 0.2887071371078491, |
| "learning_rate": 5.7959584366336535e-05, |
| "loss": 0.4132, |
| "num_tokens": 81722590.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.015371881281778408, |
| "grad_norm": 0.36038845777511597, |
| "learning_rate": 5.7919904379432913e-05, |
| "loss": 0.5152, |
| "num_tokens": 82358511.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.015490126522407473, |
| "grad_norm": 0.32132768630981445, |
| "learning_rate": 5.787985772050101e-05, |
| "loss": 0.4346, |
| "num_tokens": 82997292.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.015608371763036538, |
| "grad_norm": 0.2985667884349823, |
| "learning_rate": 5.783944497878826e-05, |
| "loss": 0.4244, |
| "num_tokens": 83636002.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.0157266170036656, |
| "grad_norm": 0.30603644251823425, |
| "learning_rate": 5.7798666748928636e-05, |
| "loss": 0.4487, |
| "num_tokens": 84266256.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.015844862244294666, |
| "grad_norm": 0.3081704378128052, |
| "learning_rate": 5.775752363093394e-05, |
| "loss": 0.4649, |
| "num_tokens": 84901903.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.01596310748492373, |
| "grad_norm": 0.29722145199775696, |
| "learning_rate": 5.7716016230184895e-05, |
| "loss": 0.4351, |
| "num_tokens": 85532297.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.016081352725552795, |
| "grad_norm": 0.27900344133377075, |
| "learning_rate": 5.767414515742235e-05, |
| "loss": 0.3898, |
| "num_tokens": 86159004.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.01619959796618186, |
| "grad_norm": 0.2939743995666504, |
| "learning_rate": 5.7631911028738184e-05, |
| "loss": 0.4395, |
| "num_tokens": 86791668.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.016317843206810925, |
| "grad_norm": 0.3190593421459198, |
| "learning_rate": 5.7589314465566326e-05, |
| "loss": 0.4502, |
| "num_tokens": 87415500.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.01643608844743999, |
| "grad_norm": 0.29683569073677063, |
| "learning_rate": 5.7546356094673545e-05, |
| "loss": 0.4181, |
| "num_tokens": 88054250.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.016554333688069054, |
| "grad_norm": 0.295808345079422, |
| "learning_rate": 5.750303654815026e-05, |
| "loss": 0.4011, |
| "num_tokens": 88683640.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01667257892869812, |
| "grad_norm": 0.33235254883766174, |
| "learning_rate": 5.745935646340125e-05, |
| "loss": 0.4017, |
| "num_tokens": 89322994.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.016790824169327184, |
| "grad_norm": 0.30324289202690125, |
| "learning_rate": 5.7415316483136266e-05, |
| "loss": 0.4486, |
| "num_tokens": 89959870.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.01690906940995625, |
| "grad_norm": 0.3970086872577667, |
| "learning_rate": 5.737091725536055e-05, |
| "loss": 0.4515, |
| "num_tokens": 90595155.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.017027314650585313, |
| "grad_norm": 0.27713295817375183, |
| "learning_rate": 5.732615943336531e-05, |
| "loss": 0.4523, |
| "num_tokens": 91229434.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.017145559891214378, |
| "grad_norm": 0.31949537992477417, |
| "learning_rate": 5.7281043675718176e-05, |
| "loss": 0.423, |
| "num_tokens": 91864729.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.017263805131843443, |
| "grad_norm": 0.2788122892379761, |
| "learning_rate": 5.7235570646253385e-05, |
| "loss": 0.4037, |
| "num_tokens": 92497696.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.017382050372472507, |
| "grad_norm": 0.33565449714660645, |
| "learning_rate": 5.71897410140621e-05, |
| "loss": 0.4794, |
| "num_tokens": 93136961.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.017500295613101572, |
| "grad_norm": 0.3093065619468689, |
| "learning_rate": 5.7143555453482564e-05, |
| "loss": 0.46, |
| "num_tokens": 93763389.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.017618540853730637, |
| "grad_norm": 0.28062355518341064, |
| "learning_rate": 5.709701464409014e-05, |
| "loss": 0.4594, |
| "num_tokens": 94396681.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.0177367860943597, |
| "grad_norm": 0.29357820749282837, |
| "learning_rate": 5.705011927068734e-05, |
| "loss": 0.4611, |
| "num_tokens": 95024975.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.017855031334988766, |
| "grad_norm": 0.37621134519577026, |
| "learning_rate": 5.700287002329374e-05, |
| "loss": 0.4681, |
| "num_tokens": 95647926.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.01797327657561783, |
| "grad_norm": 0.3109932541847229, |
| "learning_rate": 5.6955267597135795e-05, |
| "loss": 0.4347, |
| "num_tokens": 96284873.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.018091521816246896, |
| "grad_norm": 0.33683377504348755, |
| "learning_rate": 5.6907312692636665e-05, |
| "loss": 0.4484, |
| "num_tokens": 96921347.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.01820976705687596, |
| "grad_norm": 0.29445359110832214, |
| "learning_rate": 5.6859006015405905e-05, |
| "loss": 0.3997, |
| "num_tokens": 97555490.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.018328012297505025, |
| "grad_norm": 0.32711490988731384, |
| "learning_rate": 5.681034827622904e-05, |
| "loss": 0.4153, |
| "num_tokens": 98193055.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.01844625753813409, |
| "grad_norm": 0.29570791125297546, |
| "learning_rate": 5.67613401910571e-05, |
| "loss": 0.3944, |
| "num_tokens": 98826897.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.018564502778763155, |
| "grad_norm": 0.3205905854701996, |
| "learning_rate": 5.671198248099617e-05, |
| "loss": 0.4673, |
| "num_tokens": 99462013.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.01868274801939222, |
| "grad_norm": 0.29417866468429565, |
| "learning_rate": 5.666227587229669e-05, |
| "loss": 0.4771, |
| "num_tokens": 100097628.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.018800993260021284, |
| "grad_norm": 0.2989625036716461, |
| "learning_rate": 5.66122210963428e-05, |
| "loss": 0.4152, |
| "num_tokens": 100734556.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.01891923850065035, |
| "grad_norm": 0.3053020238876343, |
| "learning_rate": 5.656181888964159e-05, |
| "loss": 0.4606, |
| "num_tokens": 101371427.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.019037483741279414, |
| "grad_norm": 0.2914108633995056, |
| "learning_rate": 5.6511069993812255e-05, |
| "loss": 0.4647, |
| "num_tokens": 102008014.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.01915572898190848, |
| "grad_norm": 0.31419283151626587, |
| "learning_rate": 5.645997515557518e-05, |
| "loss": 0.4277, |
| "num_tokens": 102647195.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.019273974222537543, |
| "grad_norm": 0.25925683975219727, |
| "learning_rate": 5.640853512674095e-05, |
| "loss": 0.4409, |
| "num_tokens": 103272117.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.019392219463166608, |
| "grad_norm": 0.29054054617881775, |
| "learning_rate": 5.63567506641993e-05, |
| "loss": 0.4468, |
| "num_tokens": 103911617.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.019510464703795673, |
| "grad_norm": 0.2996600270271301, |
| "learning_rate": 5.630462252990796e-05, |
| "loss": 0.4583, |
| "num_tokens": 104546025.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.019628709944424737, |
| "grad_norm": 0.26758819818496704, |
| "learning_rate": 5.6252151490881474e-05, |
| "loss": 0.4193, |
| "num_tokens": 105181492.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.019746955185053802, |
| "grad_norm": 0.28083500266075134, |
| "learning_rate": 5.6199338319179856e-05, |
| "loss": 0.4166, |
| "num_tokens": 105818707.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.019865200425682867, |
| "grad_norm": 0.2543669641017914, |
| "learning_rate": 5.614618379189731e-05, |
| "loss": 0.3928, |
| "num_tokens": 106447672.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.01998344566631193, |
| "grad_norm": 0.29574477672576904, |
| "learning_rate": 5.609268869115072e-05, |
| "loss": 0.4303, |
| "num_tokens": 107079280.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.020101690906940996, |
| "grad_norm": 0.2757669985294342, |
| "learning_rate": 5.6038853804068205e-05, |
| "loss": 0.4325, |
| "num_tokens": 107716692.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.02021993614757006, |
| "grad_norm": 0.3341258764266968, |
| "learning_rate": 5.598467992277748e-05, |
| "loss": 0.4302, |
| "num_tokens": 108346190.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.020338181388199126, |
| "grad_norm": 0.2687680423259735, |
| "learning_rate": 5.5930167844394255e-05, |
| "loss": 0.4188, |
| "num_tokens": 108972655.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.02045642662882819, |
| "grad_norm": 0.3229896128177643, |
| "learning_rate": 5.587531837101046e-05, |
| "loss": 0.4436, |
| "num_tokens": 109606533.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.020574671869457255, |
| "grad_norm": 0.2820740044116974, |
| "learning_rate": 5.582013230968246e-05, |
| "loss": 0.4294, |
| "num_tokens": 110242667.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.02069291711008632, |
| "grad_norm": 0.35922062397003174, |
| "learning_rate": 5.5764610472419194e-05, |
| "loss": 0.4835, |
| "num_tokens": 110879342.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.020811162350715385, |
| "grad_norm": 0.2997070550918579, |
| "learning_rate": 5.5708753676170236e-05, |
| "loss": 0.4347, |
| "num_tokens": 111515578.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.02092940759134445, |
| "grad_norm": 0.2995700240135193, |
| "learning_rate": 5.565256274281369e-05, |
| "loss": 0.395, |
| "num_tokens": 112148074.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.021047652831973514, |
| "grad_norm": 0.319938600063324, |
| "learning_rate": 5.5596038499144235e-05, |
| "loss": 0.4813, |
| "num_tokens": 112784825.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.02116589807260258, |
| "grad_norm": 0.338448166847229, |
| "learning_rate": 5.5539181776860835e-05, |
| "loss": 0.457, |
| "num_tokens": 113415511.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.021284143313231644, |
| "grad_norm": 0.318758487701416, |
| "learning_rate": 5.548199341255457e-05, |
| "loss": 0.4566, |
| "num_tokens": 114014233.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.02140238855386071, |
| "grad_norm": 0.322611927986145, |
| "learning_rate": 5.542447424769632e-05, |
| "loss": 0.4384, |
| "num_tokens": 114646091.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.021520633794489773, |
| "grad_norm": 0.3043844699859619, |
| "learning_rate": 5.536662512862434e-05, |
| "loss": 0.4125, |
| "num_tokens": 115248849.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.021638879035118838, |
| "grad_norm": 0.30535179376602173, |
| "learning_rate": 5.530844690653187e-05, |
| "loss": 0.4083, |
| "num_tokens": 115882858.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.0217571242757479, |
| "grad_norm": 0.29622629284858704, |
| "learning_rate": 5.524994043745455e-05, |
| "loss": 0.4424, |
| "num_tokens": 116516321.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.021875369516376964, |
| "grad_norm": 0.3178810775279999, |
| "learning_rate": 5.519110658225789e-05, |
| "loss": 0.4187, |
| "num_tokens": 117149980.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02199361475700603, |
| "grad_norm": 0.2797812819480896, |
| "learning_rate": 5.513194620662453e-05, |
| "loss": 0.4033, |
| "num_tokens": 117787055.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.022111859997635094, |
| "grad_norm": 0.34582096338272095, |
| "learning_rate": 5.5072460181041565e-05, |
| "loss": 0.4264, |
| "num_tokens": 118414231.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.02223010523826416, |
| "grad_norm": 0.3072027266025543, |
| "learning_rate": 5.5012649380787697e-05, |
| "loss": 0.4425, |
| "num_tokens": 119042723.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.022348350478893223, |
| "grad_norm": 0.27631890773773193, |
| "learning_rate": 5.495251468592038e-05, |
| "loss": 0.47, |
| "num_tokens": 119680244.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.022466595719522288, |
| "grad_norm": 0.39626777172088623, |
| "learning_rate": 5.489205698126284e-05, |
| "loss": 0.4255, |
| "num_tokens": 120319137.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.022584840960151353, |
| "grad_norm": 0.28610390424728394, |
| "learning_rate": 5.483127715639111e-05, |
| "loss": 0.4364, |
| "num_tokens": 120954282.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.022703086200780417, |
| "grad_norm": 0.4135710597038269, |
| "learning_rate": 5.477017610562086e-05, |
| "loss": 0.4342, |
| "num_tokens": 121589180.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.022821331441409482, |
| "grad_norm": 0.3279666304588318, |
| "learning_rate": 5.4708754727994347e-05, |
| "loss": 0.4693, |
| "num_tokens": 122226045.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.022939576682038547, |
| "grad_norm": 0.3193162679672241, |
| "learning_rate": 5.4647013927267055e-05, |
| "loss": 0.411, |
| "num_tokens": 122863565.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.02305782192266761, |
| "grad_norm": 0.3436163067817688, |
| "learning_rate": 5.4584954611894535e-05, |
| "loss": 0.4065, |
| "num_tokens": 123498631.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.023176067163296676, |
| "grad_norm": 0.3175835907459259, |
| "learning_rate": 5.452257769501891e-05, |
| "loss": 0.4343, |
| "num_tokens": 124134670.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.02329431240392574, |
| "grad_norm": 0.276996374130249, |
| "learning_rate": 5.445988409445553e-05, |
| "loss": 0.4125, |
| "num_tokens": 124770499.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.023412557644554806, |
| "grad_norm": 0.3553844690322876, |
| "learning_rate": 5.4396874732679444e-05, |
| "loss": 0.4659, |
| "num_tokens": 125409234.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.02353080288518387, |
| "grad_norm": 0.26136353611946106, |
| "learning_rate": 5.433355053681179e-05, |
| "loss": 0.4354, |
| "num_tokens": 126041885.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.023649048125812935, |
| "grad_norm": 0.3091793656349182, |
| "learning_rate": 5.42699124386062e-05, |
| "loss": 0.4539, |
| "num_tokens": 126679673.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.023767293366442, |
| "grad_norm": 0.3038508892059326, |
| "learning_rate": 5.420596137443508e-05, |
| "loss": 0.4468, |
| "num_tokens": 127318553.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.023885538607071065, |
| "grad_norm": 0.257994145154953, |
| "learning_rate": 5.41416982852758e-05, |
| "loss": 0.4177, |
| "num_tokens": 127957565.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.02400378384770013, |
| "grad_norm": 0.3154793381690979, |
| "learning_rate": 5.4077124116696884e-05, |
| "loss": 0.4944, |
| "num_tokens": 128588826.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.024122029088329194, |
| "grad_norm": 0.30118247866630554, |
| "learning_rate": 5.401223981884411e-05, |
| "loss": 0.4431, |
| "num_tokens": 129222173.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.02424027432895826, |
| "grad_norm": 0.26696497201919556, |
| "learning_rate": 5.3947046346426456e-05, |
| "loss": 0.4586, |
| "num_tokens": 129857385.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.024358519569587324, |
| "grad_norm": 0.25432252883911133, |
| "learning_rate": 5.3881544658702133e-05, |
| "loss": 0.3814, |
| "num_tokens": 130486516.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.02447676481021639, |
| "grad_norm": 0.27828487753868103, |
| "learning_rate": 5.381573571946445e-05, |
| "loss": 0.4529, |
| "num_tokens": 131117306.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.024595010050845453, |
| "grad_norm": 0.29483503103256226, |
| "learning_rate": 5.374962049702759e-05, |
| "loss": 0.4738, |
| "num_tokens": 131749433.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.024713255291474518, |
| "grad_norm": 0.2637292742729187, |
| "learning_rate": 5.3683199964212405e-05, |
| "loss": 0.4242, |
| "num_tokens": 132382579.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.024831500532103583, |
| "grad_norm": 0.2828076183795929, |
| "learning_rate": 5.3616475098332105e-05, |
| "loss": 0.4374, |
| "num_tokens": 133017061.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.024949745772732648, |
| "grad_norm": 0.27759385108947754, |
| "learning_rate": 5.3549446881177853e-05, |
| "loss": 0.4296, |
| "num_tokens": 133645920.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.025067991013361712, |
| "grad_norm": 0.26630890369415283, |
| "learning_rate": 5.3482116299004336e-05, |
| "loss": 0.468, |
| "num_tokens": 134277976.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.025186236253990777, |
| "grad_norm": 0.24754807353019714, |
| "learning_rate": 5.341448434251522e-05, |
| "loss": 0.4468, |
| "num_tokens": 134913386.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.025304481494619842, |
| "grad_norm": 0.27732178568840027, |
| "learning_rate": 5.334655200684864e-05, |
| "loss": 0.4323, |
| "num_tokens": 135544399.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.025422726735248907, |
| "grad_norm": 0.30716535449028015, |
| "learning_rate": 5.327832029156247e-05, |
| "loss": 0.441, |
| "num_tokens": 136182707.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.02554097197587797, |
| "grad_norm": 0.26287323236465454, |
| "learning_rate": 5.3209790200619726e-05, |
| "loss": 0.436, |
| "num_tokens": 136819793.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.025659217216507036, |
| "grad_norm": 0.28410691022872925, |
| "learning_rate": 5.314096274237367e-05, |
| "loss": 0.4414, |
| "num_tokens": 137459203.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.0257774624571361, |
| "grad_norm": 0.27251100540161133, |
| "learning_rate": 5.3071838929553065e-05, |
| "loss": 0.4345, |
| "num_tokens": 138086108.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.025895707697765166, |
| "grad_norm": 0.24234391748905182, |
| "learning_rate": 5.300241977924722e-05, |
| "loss": 0.4244, |
| "num_tokens": 138717361.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.02601395293839423, |
| "grad_norm": 0.31852856278419495, |
| "learning_rate": 5.293270631289107e-05, |
| "loss": 0.408, |
| "num_tokens": 139353768.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.026132198179023295, |
| "grad_norm": 0.29865893721580505, |
| "learning_rate": 5.286269955625011e-05, |
| "loss": 0.4701, |
| "num_tokens": 139986012.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02625044341965236, |
| "grad_norm": 0.27321770787239075, |
| "learning_rate": 5.279240053940531e-05, |
| "loss": 0.4059, |
| "num_tokens": 140618557.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.026368688660281425, |
| "grad_norm": 0.29831984639167786, |
| "learning_rate": 5.2721810296737984e-05, |
| "loss": 0.3978, |
| "num_tokens": 141253328.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.02648693390091049, |
| "grad_norm": 0.3366415798664093, |
| "learning_rate": 5.265092986691453e-05, |
| "loss": 0.4354, |
| "num_tokens": 141885327.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.026605179141539554, |
| "grad_norm": 0.27974531054496765, |
| "learning_rate": 5.257976029287117e-05, |
| "loss": 0.4497, |
| "num_tokens": 142518760.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02672342438216862, |
| "grad_norm": 0.31790000200271606, |
| "learning_rate": 5.250830262179859e-05, |
| "loss": 0.4561, |
| "num_tokens": 143154500.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.026841669622797684, |
| "grad_norm": 0.2560494840145111, |
| "learning_rate": 5.243655790512659e-05, |
| "loss": 0.4402, |
| "num_tokens": 143792062.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.026959914863426748, |
| "grad_norm": 0.27620622515678406, |
| "learning_rate": 5.236452719850849e-05, |
| "loss": 0.3912, |
| "num_tokens": 144424810.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.027078160104055813, |
| "grad_norm": 0.2581166625022888, |
| "learning_rate": 5.2292211561805726e-05, |
| "loss": 0.4145, |
| "num_tokens": 145021445.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.027196405344684878, |
| "grad_norm": 0.297852486371994, |
| "learning_rate": 5.2219612059072196e-05, |
| "loss": 0.4675, |
| "num_tokens": 145656556.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.027314650585313942, |
| "grad_norm": 0.286258339881897, |
| "learning_rate": 5.214672975853859e-05, |
| "loss": 0.413, |
| "num_tokens": 146293020.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.027432895825943007, |
| "grad_norm": 0.2543971538543701, |
| "learning_rate": 5.207356573259671e-05, |
| "loss": 0.4335, |
| "num_tokens": 146922200.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.027551141066572072, |
| "grad_norm": 0.29354169964790344, |
| "learning_rate": 5.2000121057783674e-05, |
| "loss": 0.4786, |
| "num_tokens": 147560483.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.027669386307201137, |
| "grad_norm": 0.22866986691951752, |
| "learning_rate": 5.1926396814766034e-05, |
| "loss": 0.4198, |
| "num_tokens": 148198475.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.0277876315478302, |
| "grad_norm": 0.2605131268501282, |
| "learning_rate": 5.185239408832397e-05, |
| "loss": 0.4363, |
| "num_tokens": 148811827.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.027905876788459266, |
| "grad_norm": 0.22731252014636993, |
| "learning_rate": 5.177811396733523e-05, |
| "loss": 0.4034, |
| "num_tokens": 149446588.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.028024122029088328, |
| "grad_norm": 0.23291230201721191, |
| "learning_rate": 5.170355754475919e-05, |
| "loss": 0.3862, |
| "num_tokens": 150080880.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.028142367269717392, |
| "grad_norm": 0.2324601113796234, |
| "learning_rate": 5.162872591762069e-05, |
| "loss": 0.4557, |
| "num_tokens": 150720517.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.028260612510346457, |
| "grad_norm": 0.2646247148513794, |
| "learning_rate": 5.155362018699396e-05, |
| "loss": 0.4241, |
| "num_tokens": 151354865.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.028378857750975522, |
| "grad_norm": 0.2472905069589615, |
| "learning_rate": 5.147824145798643e-05, |
| "loss": 0.3896, |
| "num_tokens": 151989302.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.028497102991604586, |
| "grad_norm": 0.27559757232666016, |
| "learning_rate": 5.1402590839722356e-05, |
| "loss": 0.4254, |
| "num_tokens": 152622644.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.02861534823223365, |
| "grad_norm": 0.2577532231807709, |
| "learning_rate": 5.132666944532664e-05, |
| "loss": 0.4598, |
| "num_tokens": 153254978.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.028733593472862716, |
| "grad_norm": 0.26456958055496216, |
| "learning_rate": 5.125047839190837e-05, |
| "loss": 0.4006, |
| "num_tokens": 153888439.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.02885183871349178, |
| "grad_norm": 0.23455888032913208, |
| "learning_rate": 5.1174018800544395e-05, |
| "loss": 0.377, |
| "num_tokens": 154521234.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.028970083954120845, |
| "grad_norm": 0.2502966821193695, |
| "learning_rate": 5.1097291796262854e-05, |
| "loss": 0.4257, |
| "num_tokens": 155156546.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.02908832919474991, |
| "grad_norm": 0.26023155450820923, |
| "learning_rate": 5.10202985080266e-05, |
| "loss": 0.4268, |
| "num_tokens": 155791974.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.029206574435378975, |
| "grad_norm": 0.23456987738609314, |
| "learning_rate": 5.0943040068716584e-05, |
| "loss": 0.385, |
| "num_tokens": 156417737.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.02932481967600804, |
| "grad_norm": 0.23901493847370148, |
| "learning_rate": 5.086551761511521e-05, |
| "loss": 0.4553, |
| "num_tokens": 157054324.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.029443064916637104, |
| "grad_norm": 0.26856529712677, |
| "learning_rate": 5.0787732287889574e-05, |
| "loss": 0.4435, |
| "num_tokens": 157686875.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.02956131015726617, |
| "grad_norm": 0.28496497869491577, |
| "learning_rate": 5.070968523157474e-05, |
| "loss": 0.4373, |
| "num_tokens": 158318798.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.029679555397895234, |
| "grad_norm": 0.2572629451751709, |
| "learning_rate": 5.0631377594556795e-05, |
| "loss": 0.451, |
| "num_tokens": 158956587.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0297978006385243, |
| "grad_norm": 0.25811442732810974, |
| "learning_rate": 5.05528105290561e-05, |
| "loss": 0.3716, |
| "num_tokens": 159591859.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.029916045879153363, |
| "grad_norm": 0.266215980052948, |
| "learning_rate": 5.047398519111017e-05, |
| "loss": 0.4106, |
| "num_tokens": 160224798.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.030034291119782428, |
| "grad_norm": 0.2669126093387604, |
| "learning_rate": 5.0394902740556806e-05, |
| "loss": 0.4158, |
| "num_tokens": 160855622.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.030152536360411493, |
| "grad_norm": 0.27752405405044556, |
| "learning_rate": 5.031556434101694e-05, |
| "loss": 0.3848, |
| "num_tokens": 161489536.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.030270781601040558, |
| "grad_norm": 0.2833244502544403, |
| "learning_rate": 5.023597115987755e-05, |
| "loss": 0.4691, |
| "num_tokens": 162123541.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.030389026841669622, |
| "grad_norm": 0.23394179344177246, |
| "learning_rate": 5.0156124368274474e-05, |
| "loss": 0.4263, |
| "num_tokens": 162754194.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.030507272082298687, |
| "grad_norm": 0.2544839680194855, |
| "learning_rate": 5.007602514107518e-05, |
| "loss": 0.4125, |
| "num_tokens": 163389806.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.030625517322927752, |
| "grad_norm": 0.23980510234832764, |
| "learning_rate": 4.99956746568615e-05, |
| "loss": 0.4008, |
| "num_tokens": 164022489.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.030743762563556817, |
| "grad_norm": 0.2352251559495926, |
| "learning_rate": 4.991507409791223e-05, |
| "loss": 0.4105, |
| "num_tokens": 164655264.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.03086200780418588, |
| "grad_norm": 0.23630301654338837, |
| "learning_rate": 4.983422465018581e-05, |
| "loss": 0.4021, |
| "num_tokens": 165293663.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.030980253044814946, |
| "grad_norm": 0.25700318813323975, |
| "learning_rate": 4.975312750330279e-05, |
| "loss": 0.4257, |
| "num_tokens": 165930344.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.03109849828544401, |
| "grad_norm": 0.22052328288555145, |
| "learning_rate": 4.967178385052841e-05, |
| "loss": 0.3837, |
| "num_tokens": 166565332.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.031216743526073076, |
| "grad_norm": 0.25492745637893677, |
| "learning_rate": 4.959019488875499e-05, |
| "loss": 0.4271, |
| "num_tokens": 167200224.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.03133498876670214, |
| "grad_norm": 0.23811548948287964, |
| "learning_rate": 4.9508361818484334e-05, |
| "loss": 0.4191, |
| "num_tokens": 167832144.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.0314532340073312, |
| "grad_norm": 0.22468101978302002, |
| "learning_rate": 4.9426285843810045e-05, |
| "loss": 0.3999, |
| "num_tokens": 168469994.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.03157147924796027, |
| "grad_norm": 0.2614614963531494, |
| "learning_rate": 4.934396817239986e-05, |
| "loss": 0.4342, |
| "num_tokens": 169106098.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.03168972448858933, |
| "grad_norm": 0.22566261887550354, |
| "learning_rate": 4.926141001547783e-05, |
| "loss": 0.3746, |
| "num_tokens": 169737993.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.0318079697292184, |
| "grad_norm": 0.2282998412847519, |
| "learning_rate": 4.91786125878065e-05, |
| "loss": 0.3858, |
| "num_tokens": 170368812.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03192621496984746, |
| "grad_norm": 0.25831902027130127, |
| "learning_rate": 4.9095577107669084e-05, |
| "loss": 0.4595, |
| "num_tokens": 171006424.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.03204446021047653, |
| "grad_norm": 0.22491995990276337, |
| "learning_rate": 4.9012304796851486e-05, |
| "loss": 0.4136, |
| "num_tokens": 171645721.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.03216270545110559, |
| "grad_norm": 0.25414589047431946, |
| "learning_rate": 4.892879688062432e-05, |
| "loss": 0.4001, |
| "num_tokens": 172281075.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.03228095069173466, |
| "grad_norm": 0.22207149863243103, |
| "learning_rate": 4.884505458772495e-05, |
| "loss": 0.3639, |
| "num_tokens": 172914063.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03239919593236372, |
| "grad_norm": 0.23464854061603546, |
| "learning_rate": 4.876107915033933e-05, |
| "loss": 0.4264, |
| "num_tokens": 173548325.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.03251744117299279, |
| "grad_norm": 0.26920729875564575, |
| "learning_rate": 4.867687180408392e-05, |
| "loss": 0.4248, |
| "num_tokens": 174183975.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.03263568641362185, |
| "grad_norm": 0.22815345227718353, |
| "learning_rate": 4.859243378798748e-05, |
| "loss": 0.398, |
| "num_tokens": 174818549.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.03275393165425092, |
| "grad_norm": 0.232111856341362, |
| "learning_rate": 4.850776634447287e-05, |
| "loss": 0.3862, |
| "num_tokens": 175451113.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03287217689487998, |
| "grad_norm": 0.27156439423561096, |
| "learning_rate": 4.842287071933874e-05, |
| "loss": 0.433, |
| "num_tokens": 176087116.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.03299042213550905, |
| "grad_norm": 0.2743763029575348, |
| "learning_rate": 4.8337748161741207e-05, |
| "loss": 0.4497, |
| "num_tokens": 176724483.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.03310866737613811, |
| "grad_norm": 0.26658013463020325, |
| "learning_rate": 4.825239992417548e-05, |
| "loss": 0.4255, |
| "num_tokens": 177361164.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.033226912616767176, |
| "grad_norm": 0.2353833168745041, |
| "learning_rate": 4.8166827262457436e-05, |
| "loss": 0.3786, |
| "num_tokens": 177999098.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03334515785739624, |
| "grad_norm": 0.26090359687805176, |
| "learning_rate": 4.808103143570511e-05, |
| "loss": 0.4224, |
| "num_tokens": 178627820.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.033463403098025306, |
| "grad_norm": 0.23582051694393158, |
| "learning_rate": 4.7995013706320215e-05, |
| "loss": 0.4088, |
| "num_tokens": 179259176.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.03358164833865437, |
| "grad_norm": 0.26351359486579895, |
| "learning_rate": 4.790877533996955e-05, |
| "loss": 0.4279, |
| "num_tokens": 179890905.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.033699893579283435, |
| "grad_norm": 0.25399163365364075, |
| "learning_rate": 4.7822317605566335e-05, |
| "loss": 0.4169, |
| "num_tokens": 180518445.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.0338181388199125, |
| "grad_norm": 0.2980181872844696, |
| "learning_rate": 4.7735641775251624e-05, |
| "loss": 0.449, |
| "num_tokens": 181154667.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.033936384060541565, |
| "grad_norm": 0.27028796076774597, |
| "learning_rate": 4.764874912437551e-05, |
| "loss": 0.4321, |
| "num_tokens": 181789184.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.034054629301170626, |
| "grad_norm": 0.2491423338651657, |
| "learning_rate": 4.756164093147838e-05, |
| "loss": 0.4155, |
| "num_tokens": 182421462.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.034172874541799694, |
| "grad_norm": 0.3009137213230133, |
| "learning_rate": 4.747431847827214e-05, |
| "loss": 0.4015, |
| "num_tokens": 183056216.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.034291119782428756, |
| "grad_norm": 0.2448507696390152, |
| "learning_rate": 4.73867830496213e-05, |
| "loss": 0.4331, |
| "num_tokens": 183695586.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.034409365023057824, |
| "grad_norm": 0.2819685935974121, |
| "learning_rate": 4.729903593352412e-05, |
| "loss": 0.4017, |
| "num_tokens": 184323170.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.034527610263686885, |
| "grad_norm": 0.3014317750930786, |
| "learning_rate": 4.721107842109362e-05, |
| "loss": 0.4771, |
| "num_tokens": 184923402.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.03464585550431595, |
| "grad_norm": 0.23289276659488678, |
| "learning_rate": 4.712291180653859e-05, |
| "loss": 0.4004, |
| "num_tokens": 185562179.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.034764100744945015, |
| "grad_norm": 0.25156062841415405, |
| "learning_rate": 4.703453738714457e-05, |
| "loss": 0.4127, |
| "num_tokens": 186196488.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.03488234598557408, |
| "grad_norm": 0.27437835931777954, |
| "learning_rate": 4.6945956463254733e-05, |
| "loss": 0.4458, |
| "num_tokens": 186795333.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.035000591226203144, |
| "grad_norm": 0.2655051052570343, |
| "learning_rate": 4.6857170338250756e-05, |
| "loss": 0.3878, |
| "num_tokens": 187431540.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.03511883646683221, |
| "grad_norm": 0.24947364628314972, |
| "learning_rate": 4.676818031853367e-05, |
| "loss": 0.4086, |
| "num_tokens": 188067882.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.035237081707461274, |
| "grad_norm": 0.27399611473083496, |
| "learning_rate": 4.667898771350461e-05, |
| "loss": 0.4469, |
| "num_tokens": 188704706.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.03535532694809034, |
| "grad_norm": 0.23381806910037994, |
| "learning_rate": 4.658959383554554e-05, |
| "loss": 0.3872, |
| "num_tokens": 189339944.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0354735721887194, |
| "grad_norm": 0.30683842301368713, |
| "learning_rate": 4.6500000000000005e-05, |
| "loss": 0.4722, |
| "num_tokens": 189977157.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03559181742934847, |
| "grad_norm": 0.23590795695781708, |
| "learning_rate": 4.641020752515366e-05, |
| "loss": 0.4177, |
| "num_tokens": 190586411.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03571006266997753, |
| "grad_norm": 0.2523725926876068, |
| "learning_rate": 4.632021773221499e-05, |
| "loss": 0.4323, |
| "num_tokens": 191219345.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.0358283079106066, |
| "grad_norm": 0.24050471186637878, |
| "learning_rate": 4.623003194529583e-05, |
| "loss": 0.4244, |
| "num_tokens": 191855183.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.03594655315123566, |
| "grad_norm": 0.24300076067447662, |
| "learning_rate": 4.613965149139185e-05, |
| "loss": 0.3956, |
| "num_tokens": 192485493.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.03606479839186473, |
| "grad_norm": 0.2315610945224762, |
| "learning_rate": 4.6049077700363056e-05, |
| "loss": 0.3896, |
| "num_tokens": 193122797.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03618304363249379, |
| "grad_norm": 0.25560230016708374, |
| "learning_rate": 4.595831190491424e-05, |
| "loss": 0.4167, |
| "num_tokens": 193759752.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.03630128887312286, |
| "grad_norm": 0.25288936495780945, |
| "learning_rate": 4.586735544057531e-05, |
| "loss": 0.4087, |
| "num_tokens": 194394288.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.03641953411375192, |
| "grad_norm": 0.2969334125518799, |
| "learning_rate": 4.5776209645681745e-05, |
| "loss": 0.4075, |
| "num_tokens": 195027778.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.03653777935438099, |
| "grad_norm": 0.22655892372131348, |
| "learning_rate": 4.568487586135478e-05, |
| "loss": 0.3378, |
| "num_tokens": 195660013.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03665602459501005, |
| "grad_norm": 0.28944021463394165, |
| "learning_rate": 4.5593355431481754e-05, |
| "loss": 0.4249, |
| "num_tokens": 196290478.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.03677426983563912, |
| "grad_norm": 0.23864449560642242, |
| "learning_rate": 4.550164970269633e-05, |
| "loss": 0.4412, |
| "num_tokens": 196927060.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.03689251507626818, |
| "grad_norm": 0.247343510389328, |
| "learning_rate": 4.540976002435862e-05, |
| "loss": 0.4384, |
| "num_tokens": 197557085.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.03701076031689725, |
| "grad_norm": 0.2885189950466156, |
| "learning_rate": 4.53176877485354e-05, |
| "loss": 0.4252, |
| "num_tokens": 198189535.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03712900555752631, |
| "grad_norm": 0.2791072428226471, |
| "learning_rate": 4.5225434229980215e-05, |
| "loss": 0.4425, |
| "num_tokens": 198820737.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.03724725079815538, |
| "grad_norm": 0.2613127529621124, |
| "learning_rate": 4.513300082611336e-05, |
| "loss": 0.3994, |
| "num_tokens": 199451792.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.03736549603878444, |
| "grad_norm": 0.2581581473350525, |
| "learning_rate": 4.504038889700201e-05, |
| "loss": 0.4052, |
| "num_tokens": 200086012.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.0374837412794135, |
| "grad_norm": 0.25737565755844116, |
| "learning_rate": 4.494759980534017e-05, |
| "loss": 0.3975, |
| "num_tokens": 200723155.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03760198652004257, |
| "grad_norm": 0.2575814127922058, |
| "learning_rate": 4.4854634916428583e-05, |
| "loss": 0.4188, |
| "num_tokens": 201362056.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.03772023176067163, |
| "grad_norm": 0.24522624909877777, |
| "learning_rate": 4.4761495598154706e-05, |
| "loss": 0.4012, |
| "num_tokens": 201996006.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.0378384770013007, |
| "grad_norm": 0.2399868369102478, |
| "learning_rate": 4.466818322097253e-05, |
| "loss": 0.3726, |
| "num_tokens": 202591057.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.03795672224192976, |
| "grad_norm": 0.23226316273212433, |
| "learning_rate": 4.4574699157882465e-05, |
| "loss": 0.3846, |
| "num_tokens": 203228812.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.03807496748255883, |
| "grad_norm": 0.263351172208786, |
| "learning_rate": 4.44810447844111e-05, |
| "loss": 0.4168, |
| "num_tokens": 203868294.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.03819321272318789, |
| "grad_norm": 0.2123018354177475, |
| "learning_rate": 4.438722147859095e-05, |
| "loss": 0.3815, |
| "num_tokens": 204499481.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.03831145796381696, |
| "grad_norm": 0.2778543531894684, |
| "learning_rate": 4.429323062094026e-05, |
| "loss": 0.3969, |
| "num_tokens": 205133494.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.03842970320444602, |
| "grad_norm": 0.2408173829317093, |
| "learning_rate": 4.419907359444259e-05, |
| "loss": 0.4108, |
| "num_tokens": 205767024.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.038547948445075086, |
| "grad_norm": 0.26782068610191345, |
| "learning_rate": 4.410475178452652e-05, |
| "loss": 0.4291, |
| "num_tokens": 206400825.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.03866619368570415, |
| "grad_norm": 0.26312699913978577, |
| "learning_rate": 4.4010266579045256e-05, |
| "loss": 0.4136, |
| "num_tokens": 207040239.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.038784438926333216, |
| "grad_norm": 0.256391704082489, |
| "learning_rate": 4.391561936825623e-05, |
| "loss": 0.3959, |
| "num_tokens": 207676732.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.03890268416696228, |
| "grad_norm": 0.2285778969526291, |
| "learning_rate": 4.3820811544800617e-05, |
| "loss": 0.3881, |
| "num_tokens": 208313021.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.039020929407591345, |
| "grad_norm": 0.2927227318286896, |
| "learning_rate": 4.372584450368283e-05, |
| "loss": 0.4485, |
| "num_tokens": 208946344.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.03913917464822041, |
| "grad_norm": 0.25876858830451965, |
| "learning_rate": 4.3630719642250034e-05, |
| "loss": 0.4692, |
| "num_tokens": 209577542.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.039257419888849475, |
| "grad_norm": 0.2661622166633606, |
| "learning_rate": 4.3535438360171556e-05, |
| "loss": 0.4608, |
| "num_tokens": 210213046.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.039375665129478536, |
| "grad_norm": 0.2588401436805725, |
| "learning_rate": 4.344000205941831e-05, |
| "loss": 0.4155, |
| "num_tokens": 210848130.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.039493910370107604, |
| "grad_norm": 0.25796785950660706, |
| "learning_rate": 4.3344412144242146e-05, |
| "loss": 0.4037, |
| "num_tokens": 211482121.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.039612155610736666, |
| "grad_norm": 0.2662915587425232, |
| "learning_rate": 4.3248670021155206e-05, |
| "loss": 0.4512, |
| "num_tokens": 212120668.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.039730400851365734, |
| "grad_norm": 0.24322502315044403, |
| "learning_rate": 4.315277709890922e-05, |
| "loss": 0.4174, |
| "num_tokens": 212756102.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.039848646091994795, |
| "grad_norm": 0.2540619671344757, |
| "learning_rate": 4.3056734788474785e-05, |
| "loss": 0.4436, |
| "num_tokens": 213392130.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.03996689133262386, |
| "grad_norm": 0.24154382944107056, |
| "learning_rate": 4.29605445030206e-05, |
| "loss": 0.3931, |
| "num_tokens": 214021365.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.040085136573252925, |
| "grad_norm": 0.24840545654296875, |
| "learning_rate": 4.286420765789267e-05, |
| "loss": 0.4088, |
| "num_tokens": 214651340.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.04020338181388199, |
| "grad_norm": 0.2844981551170349, |
| "learning_rate": 4.276772567059347e-05, |
| "loss": 0.4351, |
| "num_tokens": 215284267.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.040321627054511054, |
| "grad_norm": 0.2580728232860565, |
| "learning_rate": 4.2671099960761116e-05, |
| "loss": 0.4454, |
| "num_tokens": 215920647.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04043987229514012, |
| "grad_norm": 0.2855488061904907, |
| "learning_rate": 4.257433195014846e-05, |
| "loss": 0.3805, |
| "num_tokens": 216555518.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.040558117535769184, |
| "grad_norm": 0.21848393976688385, |
| "learning_rate": 4.247742306260217e-05, |
| "loss": 0.3795, |
| "num_tokens": 217191272.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.04067636277639825, |
| "grad_norm": 0.264885276556015, |
| "learning_rate": 4.238037472404176e-05, |
| "loss": 0.4108, |
| "num_tokens": 217824700.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.04079460801702731, |
| "grad_norm": 0.2161663919687271, |
| "learning_rate": 4.228318836243865e-05, |
| "loss": 0.3729, |
| "num_tokens": 218455560.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04091285325765638, |
| "grad_norm": 0.22689329087734222, |
| "learning_rate": 4.218586540779515e-05, |
| "loss": 0.421, |
| "num_tokens": 219091298.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.04103109849828544, |
| "grad_norm": 0.24377533793449402, |
| "learning_rate": 4.208840729212337e-05, |
| "loss": 0.3951, |
| "num_tokens": 219727733.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.04114934373891451, |
| "grad_norm": 0.24370762705802917, |
| "learning_rate": 4.199081544942418e-05, |
| "loss": 0.4481, |
| "num_tokens": 220360695.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.04126758897954357, |
| "grad_norm": 0.23610427975654602, |
| "learning_rate": 4.189309131566615e-05, |
| "loss": 0.4373, |
| "num_tokens": 220993405.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04138583422017264, |
| "grad_norm": 0.2471226155757904, |
| "learning_rate": 4.1795236328764354e-05, |
| "loss": 0.4307, |
| "num_tokens": 221619384.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0415040794608017, |
| "grad_norm": 0.2555200159549713, |
| "learning_rate": 4.169725192855925e-05, |
| "loss": 0.4149, |
| "num_tokens": 222250253.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.04162232470143077, |
| "grad_norm": 0.26108643412590027, |
| "learning_rate": 4.159913955679548e-05, |
| "loss": 0.4016, |
| "num_tokens": 222884935.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.04174056994205983, |
| "grad_norm": 0.22140191495418549, |
| "learning_rate": 4.150090065710067e-05, |
| "loss": 0.4025, |
| "num_tokens": 223516629.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.0418588151826889, |
| "grad_norm": 0.2396477907896042, |
| "learning_rate": 4.1402536674964195e-05, |
| "loss": 0.4105, |
| "num_tokens": 224150031.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.04197706042331796, |
| "grad_norm": 0.23356612026691437, |
| "learning_rate": 4.130404905771586e-05, |
| "loss": 0.3962, |
| "num_tokens": 224786071.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.04209530566394703, |
| "grad_norm": 0.2547277510166168, |
| "learning_rate": 4.1205439254504666e-05, |
| "loss": 0.4314, |
| "num_tokens": 225421240.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.04221355090457609, |
| "grad_norm": 0.2576862871646881, |
| "learning_rate": 4.110670871627745e-05, |
| "loss": 0.396, |
| "num_tokens": 226052174.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04233179614520516, |
| "grad_norm": 0.22883984446525574, |
| "learning_rate": 4.100785889575757e-05, |
| "loss": 0.4374, |
| "num_tokens": 226689398.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.04245004138583422, |
| "grad_norm": 0.23827779293060303, |
| "learning_rate": 4.090889124742346e-05, |
| "loss": 0.4014, |
| "num_tokens": 227327616.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.04256828662646329, |
| "grad_norm": 0.22566570341587067, |
| "learning_rate": 4.080980722748733e-05, |
| "loss": 0.4054, |
| "num_tokens": 227952686.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.04268653186709235, |
| "grad_norm": 0.2515687644481659, |
| "learning_rate": 4.0710608293873634e-05, |
| "loss": 0.4194, |
| "num_tokens": 228587586.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04280477710772142, |
| "grad_norm": 0.2160085290670395, |
| "learning_rate": 4.0611295906197706e-05, |
| "loss": 0.4048, |
| "num_tokens": 229185285.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.04292302234835048, |
| "grad_norm": 0.21602442860603333, |
| "learning_rate": 4.0511871525744224e-05, |
| "loss": 0.3995, |
| "num_tokens": 229815886.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.04304126758897955, |
| "grad_norm": 0.26638063788414, |
| "learning_rate": 4.041233661544574e-05, |
| "loss": 0.4104, |
| "num_tokens": 230449875.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.04315951282960861, |
| "grad_norm": 0.21101397275924683, |
| "learning_rate": 4.0312692639861146e-05, |
| "loss": 0.4125, |
| "num_tokens": 231087769.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.043277758070237676, |
| "grad_norm": 0.22914250195026398, |
| "learning_rate": 4.021294106515411e-05, |
| "loss": 0.3969, |
| "num_tokens": 231720719.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.04339600331086674, |
| "grad_norm": 0.21389196813106537, |
| "learning_rate": 4.011308335907152e-05, |
| "loss": 0.3922, |
| "num_tokens": 232354694.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.0435142485514958, |
| "grad_norm": 0.22924332320690155, |
| "learning_rate": 4.00131209909219e-05, |
| "loss": 0.4202, |
| "num_tokens": 232986853.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.04363249379212487, |
| "grad_norm": 0.2374032735824585, |
| "learning_rate": 3.991305543155378e-05, |
| "loss": 0.4575, |
| "num_tokens": 233626246.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04375073903275393, |
| "grad_norm": 0.20903757214546204, |
| "learning_rate": 3.981288815333399e-05, |
| "loss": 0.3508, |
| "num_tokens": 234256236.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.043868984273383, |
| "grad_norm": 0.23430699110031128, |
| "learning_rate": 3.971262063012612e-05, |
| "loss": 0.4202, |
| "num_tokens": 234894656.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.04398722951401206, |
| "grad_norm": 0.21054008603096008, |
| "learning_rate": 3.9612254337268734e-05, |
| "loss": 0.4029, |
| "num_tokens": 235530175.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.044105474754641126, |
| "grad_norm": 0.22597409784793854, |
| "learning_rate": 3.95117907515537e-05, |
| "loss": 0.3881, |
| "num_tokens": 236165286.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04422371999527019, |
| "grad_norm": 0.24336762726306915, |
| "learning_rate": 3.941123135120445e-05, |
| "loss": 0.389, |
| "num_tokens": 236799872.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.044341965235899256, |
| "grad_norm": 0.2279030978679657, |
| "learning_rate": 3.9310577615854264e-05, |
| "loss": 0.3643, |
| "num_tokens": 237436361.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.04446021047652832, |
| "grad_norm": 0.20615456998348236, |
| "learning_rate": 3.920983102652443e-05, |
| "loss": 0.3824, |
| "num_tokens": 238072053.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.044578455717157385, |
| "grad_norm": 0.22816775739192963, |
| "learning_rate": 3.910899306560251e-05, |
| "loss": 0.4291, |
| "num_tokens": 238707861.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.044696700957786446, |
| "grad_norm": 0.22566092014312744, |
| "learning_rate": 3.9008065216820486e-05, |
| "loss": 0.3967, |
| "num_tokens": 239340071.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.044814946198415515, |
| "grad_norm": 0.22702094912528992, |
| "learning_rate": 3.890704896523302e-05, |
| "loss": 0.4185, |
| "num_tokens": 239974165.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.044933191439044576, |
| "grad_norm": 0.20416148006916046, |
| "learning_rate": 3.880594579719545e-05, |
| "loss": 0.3879, |
| "num_tokens": 240606077.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.045051436679673644, |
| "grad_norm": 0.2429252415895462, |
| "learning_rate": 3.870475720034206e-05, |
| "loss": 0.4027, |
| "num_tokens": 241243195.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.045169681920302705, |
| "grad_norm": 0.24931378662586212, |
| "learning_rate": 3.860348466356413e-05, |
| "loss": 0.4474, |
| "num_tokens": 241881692.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.045287927160931774, |
| "grad_norm": 0.26254212856292725, |
| "learning_rate": 3.850212967698799e-05, |
| "loss": 0.4189, |
| "num_tokens": 242520949.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.045406172401560835, |
| "grad_norm": 0.2300311028957367, |
| "learning_rate": 3.84006937319532e-05, |
| "loss": 0.3986, |
| "num_tokens": 243160575.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.0455244176421899, |
| "grad_norm": 0.24005557596683502, |
| "learning_rate": 3.829917832099051e-05, |
| "loss": 0.4128, |
| "num_tokens": 243790943.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.045642662882818964, |
| "grad_norm": 0.2699725031852722, |
| "learning_rate": 3.819758493779992e-05, |
| "loss": 0.4602, |
| "num_tokens": 244423844.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.04576090812344803, |
| "grad_norm": 0.23983405530452728, |
| "learning_rate": 3.8095915077228754e-05, |
| "loss": 0.3914, |
| "num_tokens": 245054470.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.045879153364077094, |
| "grad_norm": 0.2433352917432785, |
| "learning_rate": 3.79941702352496e-05, |
| "loss": 0.3811, |
| "num_tokens": 245688487.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.04599739860470616, |
| "grad_norm": 0.24374330043792725, |
| "learning_rate": 3.7892351908938326e-05, |
| "loss": 0.4106, |
| "num_tokens": 246325682.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04611564384533522, |
| "grad_norm": 0.21965977549552917, |
| "learning_rate": 3.7790461596452057e-05, |
| "loss": 0.4311, |
| "num_tokens": 246961506.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.04623388908596429, |
| "grad_norm": 0.23189356923103333, |
| "learning_rate": 3.7688500797007124e-05, |
| "loss": 0.3798, |
| "num_tokens": 247594032.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.04635213432659335, |
| "grad_norm": 0.2253284901380539, |
| "learning_rate": 3.758647101085699e-05, |
| "loss": 0.427, |
| "num_tokens": 248227593.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.04647037956722242, |
| "grad_norm": 0.2451157420873642, |
| "learning_rate": 3.748437373927022e-05, |
| "loss": 0.4083, |
| "num_tokens": 248859376.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04658862480785148, |
| "grad_norm": 0.22249139845371246, |
| "learning_rate": 3.738221048450834e-05, |
| "loss": 0.4254, |
| "num_tokens": 249493350.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.04670687004848055, |
| "grad_norm": 0.24161191284656525, |
| "learning_rate": 3.7279982749803736e-05, |
| "loss": 0.3853, |
| "num_tokens": 250126507.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.04682511528910961, |
| "grad_norm": 0.23410917818546295, |
| "learning_rate": 3.717769203933759e-05, |
| "loss": 0.424, |
| "num_tokens": 250765825.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.04694336052973868, |
| "grad_norm": 0.23993557691574097, |
| "learning_rate": 3.7075339858217706e-05, |
| "loss": 0.4189, |
| "num_tokens": 251403372.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04706160577036774, |
| "grad_norm": 0.32063019275665283, |
| "learning_rate": 3.697292771245633e-05, |
| "loss": 0.4365, |
| "num_tokens": 252041762.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.04717985101099681, |
| "grad_norm": 0.21619325876235962, |
| "learning_rate": 3.687045710894808e-05, |
| "loss": 0.4027, |
| "num_tokens": 252673716.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.04729809625162587, |
| "grad_norm": 0.2204645574092865, |
| "learning_rate": 3.67679295554477e-05, |
| "loss": 0.4117, |
| "num_tokens": 253310805.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04741634149225494, |
| "grad_norm": 0.21249093115329742, |
| "learning_rate": 3.666534656054788e-05, |
| "loss": 0.3398, |
| "num_tokens": 253944650.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.047534586732884, |
| "grad_norm": 0.2515881061553955, |
| "learning_rate": 3.65627096336571e-05, |
| "loss": 0.3814, |
| "num_tokens": 254579224.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.04765283197351307, |
| "grad_norm": 0.22720524668693542, |
| "learning_rate": 3.646002028497738e-05, |
| "loss": 0.3981, |
| "num_tokens": 255212393.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.04777107721414213, |
| "grad_norm": 0.24506784975528717, |
| "learning_rate": 3.63572800254821e-05, |
| "loss": 0.4215, |
| "num_tokens": 255846216.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.0478893224547712, |
| "grad_norm": 0.25425246357917786, |
| "learning_rate": 3.625449036689372e-05, |
| "loss": 0.429, |
| "num_tokens": 256484541.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.04800756769540026, |
| "grad_norm": 0.23869769275188446, |
| "learning_rate": 3.6151652821661576e-05, |
| "loss": 0.3881, |
| "num_tokens": 257114691.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.04812581293602933, |
| "grad_norm": 0.2546592056751251, |
| "learning_rate": 3.604876890293959e-05, |
| "loss": 0.4059, |
| "num_tokens": 257748044.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.04824405817665839, |
| "grad_norm": 0.22846068441867828, |
| "learning_rate": 3.594584012456403e-05, |
| "loss": 0.3613, |
| "num_tokens": 258386984.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.04836230341728746, |
| "grad_norm": 0.24633820354938507, |
| "learning_rate": 3.584286800103124e-05, |
| "loss": 0.4298, |
| "num_tokens": 259023318.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.04848054865791652, |
| "grad_norm": 0.2492648810148239, |
| "learning_rate": 3.573985404747535e-05, |
| "loss": 0.383, |
| "num_tokens": 259657204.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.048598793898545586, |
| "grad_norm": 0.22464512288570404, |
| "learning_rate": 3.563679977964595e-05, |
| "loss": 0.3838, |
| "num_tokens": 260290556.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.04871703913917465, |
| "grad_norm": 0.27683940529823303, |
| "learning_rate": 3.5533706713885844e-05, |
| "loss": 0.4461, |
| "num_tokens": 260928576.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.048835284379803716, |
| "grad_norm": 0.20443028211593628, |
| "learning_rate": 3.5430576367108694e-05, |
| "loss": 0.3948, |
| "num_tokens": 261563484.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.04895352962043278, |
| "grad_norm": 0.20911704003810883, |
| "learning_rate": 3.532741025677673e-05, |
| "loss": 0.3649, |
| "num_tokens": 262198058.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.049071774861061845, |
| "grad_norm": 0.27862629294395447, |
| "learning_rate": 3.522420990087839e-05, |
| "loss": 0.4237, |
| "num_tokens": 262825300.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.04919002010169091, |
| "grad_norm": 0.2638210654258728, |
| "learning_rate": 3.5120976817906e-05, |
| "loss": 0.4384, |
| "num_tokens": 263458362.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.049308265342319975, |
| "grad_norm": 0.24697713553905487, |
| "learning_rate": 3.5017712526833454e-05, |
| "loss": 0.3814, |
| "num_tokens": 264088367.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.049426510582949036, |
| "grad_norm": 0.2173382192850113, |
| "learning_rate": 3.491441854709384e-05, |
| "loss": 0.3949, |
| "num_tokens": 264724592.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.049544755823578104, |
| "grad_norm": 0.25613975524902344, |
| "learning_rate": 3.481109639855707e-05, |
| "loss": 0.3821, |
| "num_tokens": 265360262.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.049663001064207166, |
| "grad_norm": 0.24708124995231628, |
| "learning_rate": 3.470774760150753e-05, |
| "loss": 0.4341, |
| "num_tokens": 265997689.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.04978124630483623, |
| "grad_norm": 0.23348525166511536, |
| "learning_rate": 3.460437367662173e-05, |
| "loss": 0.4044, |
| "num_tokens": 266631262.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.049899491545465295, |
| "grad_norm": 0.23553021252155304, |
| "learning_rate": 3.450097614494592e-05, |
| "loss": 0.3966, |
| "num_tokens": 267268979.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.050017736786094356, |
| "grad_norm": 0.2573988139629364, |
| "learning_rate": 3.439755652787366e-05, |
| "loss": 0.4017, |
| "num_tokens": 267904627.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.050135982026723425, |
| "grad_norm": 0.22248908877372742, |
| "learning_rate": 3.4294116347123505e-05, |
| "loss": 0.357, |
| "num_tokens": 268543181.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.050254227267352486, |
| "grad_norm": 0.22894316911697388, |
| "learning_rate": 3.419065712471659e-05, |
| "loss": 0.4027, |
| "num_tokens": 269179996.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.050372472507981554, |
| "grad_norm": 0.25380998849868774, |
| "learning_rate": 3.4087180382954214e-05, |
| "loss": 0.3843, |
| "num_tokens": 269811253.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.050490717748610615, |
| "grad_norm": 0.23106823861598969, |
| "learning_rate": 3.398368764439546e-05, |
| "loss": 0.39, |
| "num_tokens": 270441984.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.050608962989239684, |
| "grad_norm": 0.22412751615047455, |
| "learning_rate": 3.388018043183478e-05, |
| "loss": 0.3997, |
| "num_tokens": 271074224.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.050727208229868745, |
| "grad_norm": 0.2578945457935333, |
| "learning_rate": 3.377666026827962e-05, |
| "loss": 0.446, |
| "num_tokens": 271705707.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05084545347049781, |
| "grad_norm": 0.23338672518730164, |
| "learning_rate": 3.367312867692797e-05, |
| "loss": 0.379, |
| "num_tokens": 272318335.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.050963698711126874, |
| "grad_norm": 0.209132581949234, |
| "learning_rate": 3.3569587181145974e-05, |
| "loss": 0.416, |
| "num_tokens": 272955472.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.05108194395175594, |
| "grad_norm": 0.21573707461357117, |
| "learning_rate": 3.346603730444549e-05, |
| "loss": 0.4051, |
| "num_tokens": 273559901.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.051200189192385004, |
| "grad_norm": 0.24565227329730988, |
| "learning_rate": 3.336248057046174e-05, |
| "loss": 0.4033, |
| "num_tokens": 274192648.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05131843443301407, |
| "grad_norm": 0.20935006439685822, |
| "learning_rate": 3.325891850293078e-05, |
| "loss": 0.373, |
| "num_tokens": 274831825.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.05143667967364313, |
| "grad_norm": 0.2096380591392517, |
| "learning_rate": 3.315535262566722e-05, |
| "loss": 0.358, |
| "num_tokens": 275456510.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0515549249142722, |
| "grad_norm": 0.22659966349601746, |
| "learning_rate": 3.305178446254166e-05, |
| "loss": 0.3623, |
| "num_tokens": 276091876.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.05167317015490126, |
| "grad_norm": 0.21803656220436096, |
| "learning_rate": 3.294821553745835e-05, |
| "loss": 0.4107, |
| "num_tokens": 276727335.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05179141539553033, |
| "grad_norm": 0.207914799451828, |
| "learning_rate": 3.284464737433279e-05, |
| "loss": 0.4361, |
| "num_tokens": 277359081.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.05190966063615939, |
| "grad_norm": 0.20631778240203857, |
| "learning_rate": 3.2741081497069215e-05, |
| "loss": 0.3707, |
| "num_tokens": 277990765.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.05202790587678846, |
| "grad_norm": 0.2180744856595993, |
| "learning_rate": 3.263751942953828e-05, |
| "loss": 0.3962, |
| "num_tokens": 278612675.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05214615111741752, |
| "grad_norm": 0.19695664942264557, |
| "learning_rate": 3.2533962695554515e-05, |
| "loss": 0.3742, |
| "num_tokens": 279246447.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05226439635804659, |
| "grad_norm": 0.2566263973712921, |
| "learning_rate": 3.243041281885404e-05, |
| "loss": 0.4438, |
| "num_tokens": 279880451.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.05238264159867565, |
| "grad_norm": 0.25131720304489136, |
| "learning_rate": 3.232687132307204e-05, |
| "loss": 0.4346, |
| "num_tokens": 280517149.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.05250088683930472, |
| "grad_norm": 0.21113261580467224, |
| "learning_rate": 3.222333973172039e-05, |
| "loss": 0.3724, |
| "num_tokens": 281144009.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.05261913207993378, |
| "grad_norm": 0.19451619684696198, |
| "learning_rate": 3.211981956816523e-05, |
| "loss": 0.3889, |
| "num_tokens": 281781076.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05273737732056285, |
| "grad_norm": 0.20863431692123413, |
| "learning_rate": 3.201631235560456e-05, |
| "loss": 0.3956, |
| "num_tokens": 282418028.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.05285562256119191, |
| "grad_norm": 0.22713254392147064, |
| "learning_rate": 3.1912819617045805e-05, |
| "loss": 0.385, |
| "num_tokens": 283052667.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.05297386780182098, |
| "grad_norm": 0.22602516412734985, |
| "learning_rate": 3.180934287528342e-05, |
| "loss": 0.4338, |
| "num_tokens": 283689384.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.05309211304245004, |
| "grad_norm": 0.18873152136802673, |
| "learning_rate": 3.170588365287651e-05, |
| "loss": 0.3618, |
| "num_tokens": 284325980.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05321035828307911, |
| "grad_norm": 0.22596846520900726, |
| "learning_rate": 3.1602443472126344e-05, |
| "loss": 0.4158, |
| "num_tokens": 284927410.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05332860352370817, |
| "grad_norm": 0.20799311995506287, |
| "learning_rate": 3.1499023855054086e-05, |
| "loss": 0.4023, |
| "num_tokens": 285561853.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.05344684876433724, |
| "grad_norm": 0.21219973266124725, |
| "learning_rate": 3.1395626323378266e-05, |
| "loss": 0.4127, |
| "num_tokens": 286200667.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.0535650940049663, |
| "grad_norm": 0.23271985352039337, |
| "learning_rate": 3.129225239849247e-05, |
| "loss": 0.377, |
| "num_tokens": 286838297.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05368333924559537, |
| "grad_norm": 0.23054036498069763, |
| "learning_rate": 3.118890360144293e-05, |
| "loss": 0.3806, |
| "num_tokens": 287478013.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.05380158448622443, |
| "grad_norm": 0.21830712258815765, |
| "learning_rate": 3.1085581452906166e-05, |
| "loss": 0.4124, |
| "num_tokens": 288113641.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.053919829726853497, |
| "grad_norm": 0.21437396109104156, |
| "learning_rate": 3.0982287473166544e-05, |
| "loss": 0.4056, |
| "num_tokens": 288748606.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.05403807496748256, |
| "grad_norm": 0.23408770561218262, |
| "learning_rate": 3.087902318209401e-05, |
| "loss": 0.3841, |
| "num_tokens": 289387862.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.054156320208111626, |
| "grad_norm": 0.22132480144500732, |
| "learning_rate": 3.0775790099121615e-05, |
| "loss": 0.3859, |
| "num_tokens": 290025351.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.05427456544874069, |
| "grad_norm": 0.21784645318984985, |
| "learning_rate": 3.067258974322328e-05, |
| "loss": 0.3752, |
| "num_tokens": 290660065.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.054392810689369755, |
| "grad_norm": 0.21862168610095978, |
| "learning_rate": 3.056942363289131e-05, |
| "loss": 0.3808, |
| "num_tokens": 291293769.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05451105592999882, |
| "grad_norm": 0.23824048042297363, |
| "learning_rate": 3.0466293286114164e-05, |
| "loss": 0.3878, |
| "num_tokens": 291932970.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.054629301170627885, |
| "grad_norm": 0.20896966755390167, |
| "learning_rate": 3.036320022035405e-05, |
| "loss": 0.3958, |
| "num_tokens": 292570196.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.054747546411256946, |
| "grad_norm": 0.22659938037395477, |
| "learning_rate": 3.0260145952524658e-05, |
| "loss": 0.4281, |
| "num_tokens": 293208953.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.054865791651886014, |
| "grad_norm": 0.25533124804496765, |
| "learning_rate": 3.0157131998968765e-05, |
| "loss": 0.3964, |
| "num_tokens": 293841194.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.054984036892515076, |
| "grad_norm": 0.21840247511863708, |
| "learning_rate": 3.0054159875435977e-05, |
| "loss": 0.4031, |
| "num_tokens": 294478601.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.055102282133144144, |
| "grad_norm": 0.20685090124607086, |
| "learning_rate": 2.995123109706042e-05, |
| "loss": 0.3979, |
| "num_tokens": 295110727.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.055220527373773205, |
| "grad_norm": 0.24118992686271667, |
| "learning_rate": 2.984834717833843e-05, |
| "loss": 0.3724, |
| "num_tokens": 295744614.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.05533877261440227, |
| "grad_norm": 0.21696403622627258, |
| "learning_rate": 2.9745509633106285e-05, |
| "loss": 0.3875, |
| "num_tokens": 296380861.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.055457017855031335, |
| "grad_norm": 0.2347799837589264, |
| "learning_rate": 2.964271997451791e-05, |
| "loss": 0.3951, |
| "num_tokens": 297007279.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.0555752630956604, |
| "grad_norm": 0.2174369990825653, |
| "learning_rate": 2.9539979715022626e-05, |
| "loss": 0.3757, |
| "num_tokens": 297623126.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.055693508336289464, |
| "grad_norm": 0.2637596130371094, |
| "learning_rate": 2.943729036634291e-05, |
| "loss": 0.4343, |
| "num_tokens": 298255789.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.05581175357691853, |
| "grad_norm": 0.20758095383644104, |
| "learning_rate": 2.9334653439452135e-05, |
| "loss": 0.4108, |
| "num_tokens": 298885491.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.055929998817547594, |
| "grad_norm": 0.2174261063337326, |
| "learning_rate": 2.9232070444552315e-05, |
| "loss": 0.3799, |
| "num_tokens": 299521680.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.056048244058176655, |
| "grad_norm": 0.23763810098171234, |
| "learning_rate": 2.9129542891051922e-05, |
| "loss": 0.3902, |
| "num_tokens": 300161168.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.05616648929880572, |
| "grad_norm": 0.21108706295490265, |
| "learning_rate": 2.9027072287543666e-05, |
| "loss": 0.3795, |
| "num_tokens": 300793878.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.056284734539434784, |
| "grad_norm": 0.22178317606449127, |
| "learning_rate": 2.89246601417823e-05, |
| "loss": 0.4045, |
| "num_tokens": 301427428.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.05640297978006385, |
| "grad_norm": 0.23109114170074463, |
| "learning_rate": 2.8822307960662403e-05, |
| "loss": 0.4327, |
| "num_tokens": 302063334.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.056521225020692914, |
| "grad_norm": 0.23473629355430603, |
| "learning_rate": 2.8720017250196266e-05, |
| "loss": 0.3764, |
| "num_tokens": 302696034.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.05663947026132198, |
| "grad_norm": 0.22509372234344482, |
| "learning_rate": 2.861778951549167e-05, |
| "loss": 0.4, |
| "num_tokens": 303331655.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.056757715501951043, |
| "grad_norm": 0.2507939040660858, |
| "learning_rate": 2.851562626072978e-05, |
| "loss": 0.4712, |
| "num_tokens": 303962802.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.05687596074258011, |
| "grad_norm": 0.22742438316345215, |
| "learning_rate": 2.8413528989143004e-05, |
| "loss": 0.3897, |
| "num_tokens": 304595703.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.05699420598320917, |
| "grad_norm": 0.2183639109134674, |
| "learning_rate": 2.8311499202992885e-05, |
| "loss": 0.3931, |
| "num_tokens": 305227732.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.05711245122383824, |
| "grad_norm": 0.21615217626094818, |
| "learning_rate": 2.820953840354795e-05, |
| "loss": 0.3938, |
| "num_tokens": 305861092.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0572306964644673, |
| "grad_norm": 0.22431129217147827, |
| "learning_rate": 2.810764809106168e-05, |
| "loss": 0.3977, |
| "num_tokens": 306497173.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.05734894170509637, |
| "grad_norm": 0.2126999795436859, |
| "learning_rate": 2.800582976475041e-05, |
| "loss": 0.3847, |
| "num_tokens": 307133773.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.05746718694572543, |
| "grad_norm": 0.21983444690704346, |
| "learning_rate": 2.7904084922771254e-05, |
| "loss": 0.3773, |
| "num_tokens": 307760142.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.0575854321863545, |
| "grad_norm": 0.20621925592422485, |
| "learning_rate": 2.7802415062200087e-05, |
| "loss": 0.4089, |
| "num_tokens": 308392658.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.05770367742698356, |
| "grad_norm": 0.2080400586128235, |
| "learning_rate": 2.77008216790095e-05, |
| "loss": 0.3654, |
| "num_tokens": 309023200.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.05782192266761263, |
| "grad_norm": 0.21669632196426392, |
| "learning_rate": 2.759930626804681e-05, |
| "loss": 0.4097, |
| "num_tokens": 309657191.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.05794016790824169, |
| "grad_norm": 0.2028190642595291, |
| "learning_rate": 2.7497870323012014e-05, |
| "loss": 0.4037, |
| "num_tokens": 310290361.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.05805841314887076, |
| "grad_norm": 0.23138827085494995, |
| "learning_rate": 2.7396515336435878e-05, |
| "loss": 0.4207, |
| "num_tokens": 310922697.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.05817665838949982, |
| "grad_norm": 0.23582817614078522, |
| "learning_rate": 2.7295242799657938e-05, |
| "loss": 0.4111, |
| "num_tokens": 311557453.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.05829490363012889, |
| "grad_norm": 0.20863734185695648, |
| "learning_rate": 2.7194054202804555e-05, |
| "loss": 0.4126, |
| "num_tokens": 312193193.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.05841314887075795, |
| "grad_norm": 0.21243295073509216, |
| "learning_rate": 2.709295103476699e-05, |
| "loss": 0.4107, |
| "num_tokens": 312828473.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.05853139411138702, |
| "grad_norm": 0.21561166644096375, |
| "learning_rate": 2.6991934783179515e-05, |
| "loss": 0.3824, |
| "num_tokens": 313462344.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.05864963935201608, |
| "grad_norm": 0.23026616871356964, |
| "learning_rate": 2.6891006934397505e-05, |
| "loss": 0.3821, |
| "num_tokens": 314080640.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.05876788459264515, |
| "grad_norm": 0.2129206657409668, |
| "learning_rate": 2.6790168973475585e-05, |
| "loss": 0.3938, |
| "num_tokens": 314717785.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.05888612983327421, |
| "grad_norm": 0.23650778830051422, |
| "learning_rate": 2.6689422384145744e-05, |
| "loss": 0.4503, |
| "num_tokens": 315351322.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.05900437507390328, |
| "grad_norm": 0.20518648624420166, |
| "learning_rate": 2.658876864879555e-05, |
| "loss": 0.4028, |
| "num_tokens": 315987690.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.05912262031453234, |
| "grad_norm": 191.65394592285156, |
| "learning_rate": 2.648820924844631e-05, |
| "loss": 4.9729, |
| "num_tokens": 316588692.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05924086555516141, |
| "grad_norm": 0.27919653058052063, |
| "learning_rate": 2.6387745662731268e-05, |
| "loss": 0.3813, |
| "num_tokens": 317208507.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.05935911079579047, |
| "grad_norm": 0.23787546157836914, |
| "learning_rate": 2.6287379369873878e-05, |
| "loss": 0.4319, |
| "num_tokens": 317844277.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.059477356036419536, |
| "grad_norm": 0.24857866764068604, |
| "learning_rate": 2.6187111846666015e-05, |
| "loss": 0.4168, |
| "num_tokens": 318478032.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.0595956012770486, |
| "grad_norm": 0.2471940517425537, |
| "learning_rate": 2.6086944568446233e-05, |
| "loss": 0.4461, |
| "num_tokens": 319114663.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.059713846517677666, |
| "grad_norm": 0.23387108743190765, |
| "learning_rate": 2.5986879009078095e-05, |
| "loss": 0.3444, |
| "num_tokens": 319744940.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.05983209175830673, |
| "grad_norm": 0.2251531183719635, |
| "learning_rate": 2.5886916640928474e-05, |
| "loss": 0.3914, |
| "num_tokens": 320377220.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.059950336998935795, |
| "grad_norm": 0.22722141444683075, |
| "learning_rate": 2.57870589348459e-05, |
| "loss": 0.3942, |
| "num_tokens": 321010441.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.060068582239564856, |
| "grad_norm": 0.22561167180538177, |
| "learning_rate": 2.568730736013887e-05, |
| "loss": 0.3771, |
| "num_tokens": 321643644.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.060186827480193925, |
| "grad_norm": 0.22242951393127441, |
| "learning_rate": 2.5587663384554264e-05, |
| "loss": 0.3877, |
| "num_tokens": 322275355.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.060305072720822986, |
| "grad_norm": 0.22118404507637024, |
| "learning_rate": 2.5488128474255777e-05, |
| "loss": 0.4112, |
| "num_tokens": 322908591.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.060423317961452054, |
| "grad_norm": 0.2330280840396881, |
| "learning_rate": 2.5388704093802296e-05, |
| "loss": 0.4106, |
| "num_tokens": 323542459.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.060541563202081115, |
| "grad_norm": 0.2232893407344818, |
| "learning_rate": 2.5289391706126375e-05, |
| "loss": 0.3905, |
| "num_tokens": 324176254.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.060659808442710184, |
| "grad_norm": 0.20871341228485107, |
| "learning_rate": 2.5190192772512675e-05, |
| "loss": 0.3664, |
| "num_tokens": 324803884.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.060778053683339245, |
| "grad_norm": 0.2513749599456787, |
| "learning_rate": 2.509110875257654e-05, |
| "loss": 0.4212, |
| "num_tokens": 325442394.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.06089629892396831, |
| "grad_norm": 0.2195710688829422, |
| "learning_rate": 2.4992141104242444e-05, |
| "loss": 0.4119, |
| "num_tokens": 326081004.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.061014544164597374, |
| "grad_norm": 0.2243558168411255, |
| "learning_rate": 2.4893291283722552e-05, |
| "loss": 0.3926, |
| "num_tokens": 326711646.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.06113278940522644, |
| "grad_norm": 0.2674740254878998, |
| "learning_rate": 2.479456074549534e-05, |
| "loss": 0.387, |
| "num_tokens": 327346340.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.061251034645855504, |
| "grad_norm": 0.19878649711608887, |
| "learning_rate": 2.469595094228415e-05, |
| "loss": 0.3669, |
| "num_tokens": 327982364.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.06136927988648457, |
| "grad_norm": 0.24535562098026276, |
| "learning_rate": 2.4597463325035814e-05, |
| "loss": 0.4298, |
| "num_tokens": 328613513.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.06148752512711363, |
| "grad_norm": 0.2603405714035034, |
| "learning_rate": 2.4499099342899335e-05, |
| "loss": 0.4045, |
| "num_tokens": 329246797.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.0616057703677427, |
| "grad_norm": 0.2385442852973938, |
| "learning_rate": 2.4400860443204524e-05, |
| "loss": 0.4146, |
| "num_tokens": 329882051.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.06172401560837176, |
| "grad_norm": 0.2290627807378769, |
| "learning_rate": 2.4302748071440763e-05, |
| "loss": 0.3872, |
| "num_tokens": 330518781.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.06184226084900083, |
| "grad_norm": 0.22756050527095795, |
| "learning_rate": 2.4204763671235655e-05, |
| "loss": 0.3926, |
| "num_tokens": 331148822.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.06196050608962989, |
| "grad_norm": 0.24352355301380157, |
| "learning_rate": 2.4106908684333856e-05, |
| "loss": 0.4102, |
| "num_tokens": 331779530.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.06207875133025896, |
| "grad_norm": 0.24109165370464325, |
| "learning_rate": 2.4009184550575824e-05, |
| "loss": 0.3531, |
| "num_tokens": 332409781.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.06219699657088802, |
| "grad_norm": 0.2488730251789093, |
| "learning_rate": 2.3911592707876643e-05, |
| "loss": 0.4369, |
| "num_tokens": 333047880.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.06231524181151708, |
| "grad_norm": 0.19400961697101593, |
| "learning_rate": 2.381413459220485e-05, |
| "loss": 0.4013, |
| "num_tokens": 333683586.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.06243348705214615, |
| "grad_norm": 0.23837246000766754, |
| "learning_rate": 2.371681163756134e-05, |
| "loss": 0.4032, |
| "num_tokens": 334315506.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.06255173229277522, |
| "grad_norm": 0.21308279037475586, |
| "learning_rate": 2.361962527595824e-05, |
| "loss": 0.3715, |
| "num_tokens": 334951291.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.06266997753340428, |
| "grad_norm": 0.2166317254304886, |
| "learning_rate": 2.352257693739783e-05, |
| "loss": 0.3831, |
| "num_tokens": 335585867.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.06278822277403334, |
| "grad_norm": 0.22203749418258667, |
| "learning_rate": 2.3425668049851535e-05, |
| "loss": 0.3727, |
| "num_tokens": 336220052.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.0629064680146624, |
| "grad_norm": 0.20896011590957642, |
| "learning_rate": 2.3328900039238882e-05, |
| "loss": 0.4235, |
| "num_tokens": 336856340.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.06302471325529148, |
| "grad_norm": 0.19895263016223907, |
| "learning_rate": 2.323227432940654e-05, |
| "loss": 0.3766, |
| "num_tokens": 337491980.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.06314295849592054, |
| "grad_norm": 0.21203070878982544, |
| "learning_rate": 2.3135792342107335e-05, |
| "loss": 0.3798, |
| "num_tokens": 338130649.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.0632612037365496, |
| "grad_norm": 0.24771364033222198, |
| "learning_rate": 2.3039455496979403e-05, |
| "loss": 0.4252, |
| "num_tokens": 338765865.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.06337944897717866, |
| "grad_norm": 0.2177121639251709, |
| "learning_rate": 2.294326521152522e-05, |
| "loss": 0.3969, |
| "num_tokens": 339363366.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.06349769421780774, |
| "grad_norm": 0.2345789521932602, |
| "learning_rate": 2.2847222901090787e-05, |
| "loss": 0.4415, |
| "num_tokens": 339999650.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.0636159394584368, |
| "grad_norm": 0.21585899591445923, |
| "learning_rate": 2.2751329978844802e-05, |
| "loss": 0.4005, |
| "num_tokens": 340634297.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.06373418469906586, |
| "grad_norm": 0.23928019404411316, |
| "learning_rate": 2.2655587855757862e-05, |
| "loss": 0.4249, |
| "num_tokens": 341269246.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.06385242993969492, |
| "grad_norm": 0.2342565506696701, |
| "learning_rate": 2.255999794058169e-05, |
| "loss": 0.4108, |
| "num_tokens": 341900107.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.063970675180324, |
| "grad_norm": 0.2086341232061386, |
| "learning_rate": 2.246456163982845e-05, |
| "loss": 0.4149, |
| "num_tokens": 342539152.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.06408892042095306, |
| "grad_norm": 0.20828045904636383, |
| "learning_rate": 2.236928035774997e-05, |
| "loss": 0.4131, |
| "num_tokens": 343166271.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.06420716566158212, |
| "grad_norm": 0.20667296648025513, |
| "learning_rate": 2.2274155496317174e-05, |
| "loss": 0.3735, |
| "num_tokens": 343801657.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.06432541090221118, |
| "grad_norm": 0.20303893089294434, |
| "learning_rate": 2.217918845519939e-05, |
| "loss": 0.3877, |
| "num_tokens": 344436926.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.06444365614284026, |
| "grad_norm": 0.1928926706314087, |
| "learning_rate": 2.208438063174377e-05, |
| "loss": 0.3732, |
| "num_tokens": 345071661.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.06456190138346932, |
| "grad_norm": 0.24890753626823425, |
| "learning_rate": 2.1989733420954752e-05, |
| "loss": 0.4558, |
| "num_tokens": 345710262.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.06468014662409838, |
| "grad_norm": 0.21143190562725067, |
| "learning_rate": 2.1895248215473494e-05, |
| "loss": 0.396, |
| "num_tokens": 346345760.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.06479839186472744, |
| "grad_norm": 0.20359720289707184, |
| "learning_rate": 2.1800926405557425e-05, |
| "loss": 0.3731, |
| "num_tokens": 346978472.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.06491663710535651, |
| "grad_norm": 0.22057001292705536, |
| "learning_rate": 2.1706769379059748e-05, |
| "loss": 0.3875, |
| "num_tokens": 347615067.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.06503488234598558, |
| "grad_norm": 0.21384279429912567, |
| "learning_rate": 2.161277852140905e-05, |
| "loss": 0.4085, |
| "num_tokens": 348251545.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06515312758661464, |
| "grad_norm": 0.2024473398923874, |
| "learning_rate": 2.151895521558892e-05, |
| "loss": 0.3993, |
| "num_tokens": 348888946.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.0652713728272437, |
| "grad_norm": 0.23349706828594208, |
| "learning_rate": 2.1425300842117537e-05, |
| "loss": 0.4371, |
| "num_tokens": 349519613.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.06538961806787277, |
| "grad_norm": 0.20403353869915009, |
| "learning_rate": 2.133181677902747e-05, |
| "loss": 0.39, |
| "num_tokens": 350152254.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.06550786330850183, |
| "grad_norm": 0.21594958007335663, |
| "learning_rate": 2.1238504401845306e-05, |
| "loss": 0.3878, |
| "num_tokens": 350786547.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.0656261085491309, |
| "grad_norm": 0.2234022170305252, |
| "learning_rate": 2.1145365083571418e-05, |
| "loss": 0.3961, |
| "num_tokens": 351422383.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.06574435378975996, |
| "grad_norm": 0.21230868995189667, |
| "learning_rate": 2.105240019465984e-05, |
| "loss": 0.4062, |
| "num_tokens": 352061087.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.06586259903038903, |
| "grad_norm": 0.21539685130119324, |
| "learning_rate": 2.095961110299799e-05, |
| "loss": 0.3688, |
| "num_tokens": 352697788.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.0659808442710181, |
| "grad_norm": 0.2293699085712433, |
| "learning_rate": 2.086699917388664e-05, |
| "loss": 0.3892, |
| "num_tokens": 353327836.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.06609908951164715, |
| "grad_norm": 0.2052561193704605, |
| "learning_rate": 2.0774565770019797e-05, |
| "loss": 0.38, |
| "num_tokens": 353963351.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.06621733475227622, |
| "grad_norm": 0.2142726480960846, |
| "learning_rate": 2.06823122514646e-05, |
| "loss": 0.4144, |
| "num_tokens": 354576403.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.06633557999290529, |
| "grad_norm": 0.22821441292762756, |
| "learning_rate": 2.0590239975641387e-05, |
| "loss": 0.4167, |
| "num_tokens": 355211385.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.06645382523353435, |
| "grad_norm": 0.2132827341556549, |
| "learning_rate": 2.0498350297303682e-05, |
| "loss": 0.4177, |
| "num_tokens": 355842242.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.06657207047416341, |
| "grad_norm": 0.21103453636169434, |
| "learning_rate": 2.0406644568518244e-05, |
| "loss": 0.3693, |
| "num_tokens": 356473678.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.06669031571479248, |
| "grad_norm": 0.20970512926578522, |
| "learning_rate": 2.031512413864523e-05, |
| "loss": 0.3613, |
| "num_tokens": 357112492.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.06680856095542155, |
| "grad_norm": 0.25300124287605286, |
| "learning_rate": 2.0223790354318263e-05, |
| "loss": 0.3903, |
| "num_tokens": 357748442.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.06692680619605061, |
| "grad_norm": 0.18217869102954865, |
| "learning_rate": 2.013264455942469e-05, |
| "loss": 0.3531, |
| "num_tokens": 358387012.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.06704505143667967, |
| "grad_norm": 0.202724426984787, |
| "learning_rate": 2.0041688095085776e-05, |
| "loss": 0.3667, |
| "num_tokens": 359025550.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.06716329667730873, |
| "grad_norm": 0.2007199376821518, |
| "learning_rate": 1.9950922299636945e-05, |
| "loss": 0.407, |
| "num_tokens": 359661915.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.06728154191793781, |
| "grad_norm": 0.212602898478508, |
| "learning_rate": 1.986034850860815e-05, |
| "loss": 0.3709, |
| "num_tokens": 360295739.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.06739978715856687, |
| "grad_norm": 0.20929577946662903, |
| "learning_rate": 1.9769968054704174e-05, |
| "loss": 0.4242, |
| "num_tokens": 360929829.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.06751803239919593, |
| "grad_norm": 0.19647814333438873, |
| "learning_rate": 1.9679782267785006e-05, |
| "loss": 0.3632, |
| "num_tokens": 361568418.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.067636277639825, |
| "grad_norm": 0.22293861210346222, |
| "learning_rate": 1.9589792474846353e-05, |
| "loss": 0.3513, |
| "num_tokens": 362197303.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.06775452288045407, |
| "grad_norm": 0.24212084710597992, |
| "learning_rate": 1.9500000000000006e-05, |
| "loss": 0.393, |
| "num_tokens": 362825904.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.06787276812108313, |
| "grad_norm": 0.20932357013225555, |
| "learning_rate": 1.9410406164454458e-05, |
| "loss": 0.3854, |
| "num_tokens": 363465140.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.06799101336171219, |
| "grad_norm": 0.20063243806362152, |
| "learning_rate": 1.9321012286495403e-05, |
| "loss": 0.3874, |
| "num_tokens": 364097168.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.06810925860234125, |
| "grad_norm": 0.1878458708524704, |
| "learning_rate": 1.9231819681466337e-05, |
| "loss": 0.3658, |
| "num_tokens": 364728470.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.06822750384297033, |
| "grad_norm": 0.2246370166540146, |
| "learning_rate": 1.914282966174925e-05, |
| "loss": 0.4268, |
| "num_tokens": 365363497.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.06834574908359939, |
| "grad_norm": 0.24067296087741852, |
| "learning_rate": 1.9054043536745268e-05, |
| "loss": 0.4456, |
| "num_tokens": 366000699.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.06846399432422845, |
| "grad_norm": 0.18838095664978027, |
| "learning_rate": 1.8965462612855428e-05, |
| "loss": 0.3526, |
| "num_tokens": 366624851.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.06858223956485751, |
| "grad_norm": 0.1913178265094757, |
| "learning_rate": 1.8877088193461407e-05, |
| "loss": 0.3845, |
| "num_tokens": 367261099.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.06870048480548657, |
| "grad_norm": 0.20684710144996643, |
| "learning_rate": 1.878892157890638e-05, |
| "loss": 0.3567, |
| "num_tokens": 367897458.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.06881873004611565, |
| "grad_norm": 0.21800653636455536, |
| "learning_rate": 1.8700964066475868e-05, |
| "loss": 0.4243, |
| "num_tokens": 368534927.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.06893697528674471, |
| "grad_norm": 0.21104471385478973, |
| "learning_rate": 1.86132169503787e-05, |
| "loss": 0.4147, |
| "num_tokens": 369169358.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.06905522052737377, |
| "grad_norm": 0.20770899951457977, |
| "learning_rate": 1.8525681521727856e-05, |
| "loss": 0.405, |
| "num_tokens": 369806601.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.06917346576800283, |
| "grad_norm": 0.20592570304870605, |
| "learning_rate": 1.8438359068521625e-05, |
| "loss": 0.3933, |
| "num_tokens": 370442728.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.0692917110086319, |
| "grad_norm": 0.20783546566963196, |
| "learning_rate": 1.83512508756245e-05, |
| "loss": 0.4044, |
| "num_tokens": 371079275.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.06940995624926097, |
| "grad_norm": 0.20856884121894836, |
| "learning_rate": 1.8264358224748374e-05, |
| "loss": 0.3986, |
| "num_tokens": 371716282.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.06952820148989003, |
| "grad_norm": 0.19124871492385864, |
| "learning_rate": 1.817768239443367e-05, |
| "loss": 0.4001, |
| "num_tokens": 372347661.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.06964644673051909, |
| "grad_norm": 0.22391672432422638, |
| "learning_rate": 1.8091224660030457e-05, |
| "loss": 0.3906, |
| "num_tokens": 372977936.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.06976469197114817, |
| "grad_norm": 0.22682306170463562, |
| "learning_rate": 1.8004986293679783e-05, |
| "loss": 0.4097, |
| "num_tokens": 373613192.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.06988293721177723, |
| "grad_norm": 0.1943192332983017, |
| "learning_rate": 1.79189685642949e-05, |
| "loss": 0.4018, |
| "num_tokens": 374251716.0, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.07000118245240629, |
| "grad_norm": 0.1957077533006668, |
| "learning_rate": 1.7833172737542572e-05, |
| "loss": 0.359, |
| "num_tokens": 374880470.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.07011942769303535, |
| "grad_norm": 0.21087896823883057, |
| "learning_rate": 1.774760007582453e-05, |
| "loss": 0.399, |
| "num_tokens": 375518014.0, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.07023767293366442, |
| "grad_norm": 0.20506969094276428, |
| "learning_rate": 1.76622518382588e-05, |
| "loss": 0.4005, |
| "num_tokens": 376156899.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.07035591817429349, |
| "grad_norm": 0.18575182557106018, |
| "learning_rate": 1.7577129280661264e-05, |
| "loss": 0.3837, |
| "num_tokens": 376796416.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.07047416341492255, |
| "grad_norm": 0.20734459161758423, |
| "learning_rate": 1.7492233655527138e-05, |
| "loss": 0.3834, |
| "num_tokens": 377430299.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.07059240865555161, |
| "grad_norm": 0.18199484050273895, |
| "learning_rate": 1.7407566212012526e-05, |
| "loss": 0.3334, |
| "num_tokens": 378036010.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.07071065389618068, |
| "grad_norm": 0.21089966595172882, |
| "learning_rate": 1.7323128195916088e-05, |
| "loss": 0.4233, |
| "num_tokens": 378662576.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.07082889913680974, |
| "grad_norm": 0.19139453768730164, |
| "learning_rate": 1.723892084966068e-05, |
| "loss": 0.3544, |
| "num_tokens": 379292706.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.0709471443774388, |
| "grad_norm": 0.20988748967647552, |
| "learning_rate": 1.7154945412275056e-05, |
| "loss": 0.4113, |
| "num_tokens": 379923752.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07106538961806787, |
| "grad_norm": 0.21000663936138153, |
| "learning_rate": 1.7071203119375692e-05, |
| "loss": 0.3831, |
| "num_tokens": 380556540.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.07118363485869694, |
| "grad_norm": 0.187398761510849, |
| "learning_rate": 1.698769520314853e-05, |
| "loss": 0.3572, |
| "num_tokens": 381191645.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.071301880099326, |
| "grad_norm": 0.1953067183494568, |
| "learning_rate": 1.6904422892330918e-05, |
| "loss": 0.4128, |
| "num_tokens": 381827763.0, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.07142012533995507, |
| "grad_norm": 0.19437581300735474, |
| "learning_rate": 1.68213874121935e-05, |
| "loss": 0.379, |
| "num_tokens": 382466825.0, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.07153837058058413, |
| "grad_norm": 0.21022436022758484, |
| "learning_rate": 1.6738589984522172e-05, |
| "loss": 0.3804, |
| "num_tokens": 383103907.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.0716566158212132, |
| "grad_norm": 0.2030460089445114, |
| "learning_rate": 1.665603182760014e-05, |
| "loss": 0.4009, |
| "num_tokens": 383736705.0, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.07177486106184226, |
| "grad_norm": 0.21273180842399597, |
| "learning_rate": 1.657371415618996e-05, |
| "loss": 0.4078, |
| "num_tokens": 384376310.0, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.07189310630247132, |
| "grad_norm": 0.184920996427536, |
| "learning_rate": 1.6491638181515668e-05, |
| "loss": 0.3793, |
| "num_tokens": 385007094.0, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.07201135154310039, |
| "grad_norm": 0.18787135183811188, |
| "learning_rate": 1.6409805111245015e-05, |
| "loss": 0.3604, |
| "num_tokens": 385646534.0, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.07212959678372946, |
| "grad_norm": 0.18394650518894196, |
| "learning_rate": 1.632821614947159e-05, |
| "loss": 0.3549, |
| "num_tokens": 386283106.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.07224784202435852, |
| "grad_norm": 0.18069717288017273, |
| "learning_rate": 1.624687249669722e-05, |
| "loss": 0.3509, |
| "num_tokens": 386916395.0, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.07236608726498758, |
| "grad_norm": 0.2196332812309265, |
| "learning_rate": 1.6165775349814197e-05, |
| "loss": 0.3995, |
| "num_tokens": 387553614.0, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.07248433250561664, |
| "grad_norm": 0.20063099265098572, |
| "learning_rate": 1.608492590208777e-05, |
| "loss": 0.3657, |
| "num_tokens": 388189908.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.07260257774624572, |
| "grad_norm": 0.2032419592142105, |
| "learning_rate": 1.6004325343138506e-05, |
| "loss": 0.4057, |
| "num_tokens": 388827274.0, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.07272082298687478, |
| "grad_norm": 0.2009783685207367, |
| "learning_rate": 1.5923974858924816e-05, |
| "loss": 0.3746, |
| "num_tokens": 389460786.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.07283906822750384, |
| "grad_norm": 0.19908788800239563, |
| "learning_rate": 1.5843875631725528e-05, |
| "loss": 0.3981, |
| "num_tokens": 390066154.0, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.0729573134681329, |
| "grad_norm": 0.1834346354007721, |
| "learning_rate": 1.5764028840122463e-05, |
| "loss": 0.3648, |
| "num_tokens": 390700370.0, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.07307555870876198, |
| "grad_norm": 0.2006380409002304, |
| "learning_rate": 1.568443565898307e-05, |
| "loss": 0.3693, |
| "num_tokens": 391333815.0, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.07319380394939104, |
| "grad_norm": 0.1987355351448059, |
| "learning_rate": 1.5605097259443196e-05, |
| "loss": 0.3864, |
| "num_tokens": 391972641.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.0733120491900201, |
| "grad_norm": 0.20351499319076538, |
| "learning_rate": 1.5526014808889836e-05, |
| "loss": 0.4113, |
| "num_tokens": 392607122.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.07343029443064916, |
| "grad_norm": 0.20003947615623474, |
| "learning_rate": 1.5447189470943905e-05, |
| "loss": 0.3607, |
| "num_tokens": 393234821.0, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.07354853967127824, |
| "grad_norm": 0.20422472059726715, |
| "learning_rate": 1.536862240544321e-05, |
| "loss": 0.3633, |
| "num_tokens": 393867338.0, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.0736667849119073, |
| "grad_norm": 0.18243864178657532, |
| "learning_rate": 1.5290314768425274e-05, |
| "loss": 0.3752, |
| "num_tokens": 394505968.0, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.07378503015253636, |
| "grad_norm": 0.22229041159152985, |
| "learning_rate": 1.5212267712110427e-05, |
| "loss": 0.4205, |
| "num_tokens": 395143798.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.07390327539316542, |
| "grad_norm": 0.19298569858074188, |
| "learning_rate": 1.5134482384884803e-05, |
| "loss": 0.368, |
| "num_tokens": 395781916.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.0740215206337945, |
| "grad_norm": 0.20785243809223175, |
| "learning_rate": 1.5056959931283423e-05, |
| "loss": 0.4121, |
| "num_tokens": 396419440.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.07413976587442356, |
| "grad_norm": 0.19097504019737244, |
| "learning_rate": 1.4979701491973403e-05, |
| "loss": 0.3539, |
| "num_tokens": 397053137.0, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.07425801111505262, |
| "grad_norm": 0.2216179519891739, |
| "learning_rate": 1.490270820373715e-05, |
| "loss": 0.3927, |
| "num_tokens": 397685003.0, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.07437625635568168, |
| "grad_norm": 0.21564562618732452, |
| "learning_rate": 1.4825981199455601e-05, |
| "loss": 0.4046, |
| "num_tokens": 398323686.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.07449450159631076, |
| "grad_norm": 0.20918431878089905, |
| "learning_rate": 1.4749521608091632e-05, |
| "loss": 0.4025, |
| "num_tokens": 398958685.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.07461274683693982, |
| "grad_norm": 0.2055424153804779, |
| "learning_rate": 1.4673330554673358e-05, |
| "loss": 0.3961, |
| "num_tokens": 399595823.0, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.07473099207756888, |
| "grad_norm": 0.19133684039115906, |
| "learning_rate": 1.459740916027765e-05, |
| "loss": 0.3868, |
| "num_tokens": 400234363.0, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.07484923731819794, |
| "grad_norm": 0.20725229382514954, |
| "learning_rate": 1.4521758542013575e-05, |
| "loss": 0.3999, |
| "num_tokens": 400860312.0, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.074967482558827, |
| "grad_norm": 0.19468720257282257, |
| "learning_rate": 1.4446379813006028e-05, |
| "loss": 0.3931, |
| "num_tokens": 401493314.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.07508572779945608, |
| "grad_norm": 0.21458375453948975, |
| "learning_rate": 1.4371274082379317e-05, |
| "loss": 0.4047, |
| "num_tokens": 402131410.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.07520397304008514, |
| "grad_norm": 0.21077150106430054, |
| "learning_rate": 1.4296442455240818e-05, |
| "loss": 0.4181, |
| "num_tokens": 402767694.0, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.0753222182807142, |
| "grad_norm": 0.21079093217849731, |
| "learning_rate": 1.4221886032664769e-05, |
| "loss": 0.4037, |
| "num_tokens": 403401170.0, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.07544046352134326, |
| "grad_norm": 0.1916537582874298, |
| "learning_rate": 1.4147605911676037e-05, |
| "loss": 0.3909, |
| "num_tokens": 404033223.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.07555870876197233, |
| "grad_norm": 0.17826271057128906, |
| "learning_rate": 1.4073603185233966e-05, |
| "loss": 0.3837, |
| "num_tokens": 404669142.0, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.0756769540026014, |
| "grad_norm": 0.18769319355487823, |
| "learning_rate": 1.3999878942216336e-05, |
| "loss": 0.3976, |
| "num_tokens": 405305698.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.07579519924323046, |
| "grad_norm": 0.21683697402477264, |
| "learning_rate": 1.3926434267403286e-05, |
| "loss": 0.4228, |
| "num_tokens": 405935366.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.07591344448385952, |
| "grad_norm": 0.1858586221933365, |
| "learning_rate": 1.3853270241461407e-05, |
| "loss": 0.3949, |
| "num_tokens": 406563939.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.0760316897244886, |
| "grad_norm": 0.1963283121585846, |
| "learning_rate": 1.378038794092781e-05, |
| "loss": 0.3806, |
| "num_tokens": 407201876.0, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.07614993496511765, |
| "grad_norm": 0.1992059051990509, |
| "learning_rate": 1.3707788438194276e-05, |
| "loss": 0.3715, |
| "num_tokens": 407834876.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.07626818020574672, |
| "grad_norm": 0.19572339951992035, |
| "learning_rate": 1.3635472801491516e-05, |
| "loss": 0.3752, |
| "num_tokens": 408474126.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.07638642544637578, |
| "grad_norm": 0.185529887676239, |
| "learning_rate": 1.3563442094873424e-05, |
| "loss": 0.3354, |
| "num_tokens": 409110752.0, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.07650467068700485, |
| "grad_norm": 0.20446783304214478, |
| "learning_rate": 1.349169737820141e-05, |
| "loss": 0.3986, |
| "num_tokens": 409744230.0, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.07662291592763391, |
| "grad_norm": 0.20862102508544922, |
| "learning_rate": 1.3420239707128845e-05, |
| "loss": 0.3885, |
| "num_tokens": 410377730.0, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.07674116116826298, |
| "grad_norm": 0.19482731819152832, |
| "learning_rate": 1.3349070133085478e-05, |
| "loss": 0.369, |
| "num_tokens": 411014041.0, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.07685940640889204, |
| "grad_norm": 0.1799471527338028, |
| "learning_rate": 1.327818970326202e-05, |
| "loss": 0.377, |
| "num_tokens": 411653738.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07697765164952111, |
| "grad_norm": 0.17572778463363647, |
| "learning_rate": 1.3207599460594695e-05, |
| "loss": 0.347, |
| "num_tokens": 412288459.0, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.07709589689015017, |
| "grad_norm": 0.18439733982086182, |
| "learning_rate": 1.31373004437499e-05, |
| "loss": 0.3861, |
| "num_tokens": 412924573.0, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.07721414213077923, |
| "grad_norm": 0.18092259764671326, |
| "learning_rate": 1.3067293687108938e-05, |
| "loss": 0.3428, |
| "num_tokens": 413557882.0, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.0773323873714083, |
| "grad_norm": 0.17916624248027802, |
| "learning_rate": 1.2997580220752791e-05, |
| "loss": 0.3431, |
| "num_tokens": 414190765.0, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.07745063261203737, |
| "grad_norm": 0.18362957239151, |
| "learning_rate": 1.2928161070446937e-05, |
| "loss": 0.3517, |
| "num_tokens": 414824481.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.07756887785266643, |
| "grad_norm": 0.18938778340816498, |
| "learning_rate": 1.2859037257626331e-05, |
| "loss": 0.3749, |
| "num_tokens": 415462470.0, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.0776871230932955, |
| "grad_norm": 0.18327617645263672, |
| "learning_rate": 1.2790209799380269e-05, |
| "loss": 0.4054, |
| "num_tokens": 416098823.0, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.07780536833392455, |
| "grad_norm": 0.18833239376544952, |
| "learning_rate": 1.2721679708437516e-05, |
| "loss": 0.3851, |
| "num_tokens": 416727909.0, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.07792361357455363, |
| "grad_norm": 0.21469521522521973, |
| "learning_rate": 1.2653447993151367e-05, |
| "loss": 0.4095, |
| "num_tokens": 417362676.0, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.07804185881518269, |
| "grad_norm": 0.1744416207075119, |
| "learning_rate": 1.2585515657484778e-05, |
| "loss": 0.3625, |
| "num_tokens": 417996258.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.07816010405581175, |
| "grad_norm": 0.19375596940517426, |
| "learning_rate": 1.2517883700995673e-05, |
| "loss": 0.4059, |
| "num_tokens": 418626034.0, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.07827834929644081, |
| "grad_norm": 0.19247286021709442, |
| "learning_rate": 1.2450553118822141e-05, |
| "loss": 0.4297, |
| "num_tokens": 419263225.0, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.07839659453706989, |
| "grad_norm": 0.18751764297485352, |
| "learning_rate": 1.238352490166789e-05, |
| "loss": 0.3912, |
| "num_tokens": 419892898.0, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.07851483977769895, |
| "grad_norm": 0.1725941002368927, |
| "learning_rate": 1.2316800035787598e-05, |
| "loss": 0.3779, |
| "num_tokens": 420527528.0, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.07863308501832801, |
| "grad_norm": 0.17689573764801025, |
| "learning_rate": 1.2250379502972414e-05, |
| "loss": 0.3802, |
| "num_tokens": 421156121.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.07875133025895707, |
| "grad_norm": 0.17760339379310608, |
| "learning_rate": 1.2184264280535551e-05, |
| "loss": 0.3315, |
| "num_tokens": 421790061.0, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.07886957549958615, |
| "grad_norm": 0.18165592849254608, |
| "learning_rate": 1.2118455341297868e-05, |
| "loss": 0.39, |
| "num_tokens": 422426991.0, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.07898782074021521, |
| "grad_norm": 0.19043633341789246, |
| "learning_rate": 1.2052953653573545e-05, |
| "loss": 0.3475, |
| "num_tokens": 423063834.0, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.07910606598084427, |
| "grad_norm": 0.1863887906074524, |
| "learning_rate": 1.1987760181155897e-05, |
| "loss": 0.3814, |
| "num_tokens": 423703537.0, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.07922431122147333, |
| "grad_norm": 0.19008512794971466, |
| "learning_rate": 1.1922875883303112e-05, |
| "loss": 0.3986, |
| "num_tokens": 424330180.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.0793425564621024, |
| "grad_norm": 0.1887669861316681, |
| "learning_rate": 1.1858301714724201e-05, |
| "loss": 0.4111, |
| "num_tokens": 424966976.0, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.07946080170273147, |
| "grad_norm": 0.20428043603897095, |
| "learning_rate": 1.1794038625564926e-05, |
| "loss": 0.3843, |
| "num_tokens": 425604191.0, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.07957904694336053, |
| "grad_norm": 0.20065979659557343, |
| "learning_rate": 1.1730087561393799e-05, |
| "loss": 0.3345, |
| "num_tokens": 426240218.0, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.07969729218398959, |
| "grad_norm": 0.18954698741436005, |
| "learning_rate": 1.1666449463188212e-05, |
| "loss": 0.3979, |
| "num_tokens": 426878525.0, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.07981553742461867, |
| "grad_norm": 0.19051022827625275, |
| "learning_rate": 1.1603125267320565e-05, |
| "loss": 0.3658, |
| "num_tokens": 427512790.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.07993378266524773, |
| "grad_norm": 0.21397797763347626, |
| "learning_rate": 1.1540115905544473e-05, |
| "loss": 0.4099, |
| "num_tokens": 428150456.0, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.08005202790587679, |
| "grad_norm": 0.1928778886795044, |
| "learning_rate": 1.1477422304981104e-05, |
| "loss": 0.3455, |
| "num_tokens": 428783253.0, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.08017027314650585, |
| "grad_norm": 0.19302059710025787, |
| "learning_rate": 1.1415045388105477e-05, |
| "loss": 0.3846, |
| "num_tokens": 429419007.0, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.08028851838713492, |
| "grad_norm": 0.190629780292511, |
| "learning_rate": 1.1352986072732943e-05, |
| "loss": 0.3779, |
| "num_tokens": 430051255.0, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.08040676362776399, |
| "grad_norm": 0.22666533291339874, |
| "learning_rate": 1.1291245272005658e-05, |
| "loss": 0.4233, |
| "num_tokens": 430683994.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.08052500886839305, |
| "grad_norm": 0.2007281333208084, |
| "learning_rate": 1.1229823894379133e-05, |
| "loss": 0.3534, |
| "num_tokens": 431314161.0, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.08064325410902211, |
| "grad_norm": 0.2027830183506012, |
| "learning_rate": 1.1168722843608897e-05, |
| "loss": 0.3763, |
| "num_tokens": 431943550.0, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.08076149934965117, |
| "grad_norm": 0.18750514090061188, |
| "learning_rate": 1.1107943018737158e-05, |
| "loss": 0.3677, |
| "num_tokens": 432580022.0, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.08087974459028024, |
| "grad_norm": 0.20712460577487946, |
| "learning_rate": 1.104748531407962e-05, |
| "loss": 0.4149, |
| "num_tokens": 433215860.0, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.0809979898309093, |
| "grad_norm": 0.21580064296722412, |
| "learning_rate": 1.0987350619212307e-05, |
| "loss": 0.3697, |
| "num_tokens": 433849766.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.08111623507153837, |
| "grad_norm": 0.16628190875053406, |
| "learning_rate": 1.0927539818958437e-05, |
| "loss": 0.348, |
| "num_tokens": 434484743.0, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.08123448031216743, |
| "grad_norm": 0.2060742974281311, |
| "learning_rate": 1.0868053793375467e-05, |
| "loss": 0.3591, |
| "num_tokens": 435100372.0, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.0813527255527965, |
| "grad_norm": 0.2135939598083496, |
| "learning_rate": 1.0808893417742116e-05, |
| "loss": 0.4258, |
| "num_tokens": 435733891.0, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.08147097079342557, |
| "grad_norm": 0.1941777616739273, |
| "learning_rate": 1.0750059562545451e-05, |
| "loss": 0.3644, |
| "num_tokens": 436365690.0, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.08158921603405463, |
| "grad_norm": 0.17885838449001312, |
| "learning_rate": 1.0691553093468144e-05, |
| "loss": 0.3508, |
| "num_tokens": 437003639.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.08170746127468369, |
| "grad_norm": 0.18553341925144196, |
| "learning_rate": 1.0633374871375666e-05, |
| "loss": 0.3832, |
| "num_tokens": 437642920.0, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.08182570651531276, |
| "grad_norm": 0.2075071483850479, |
| "learning_rate": 1.0575525752303687e-05, |
| "loss": 0.3829, |
| "num_tokens": 438277063.0, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.08194395175594182, |
| "grad_norm": 0.21307510137557983, |
| "learning_rate": 1.0518006587445431e-05, |
| "loss": 0.3931, |
| "num_tokens": 438915083.0, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.08206219699657089, |
| "grad_norm": 0.20583738386631012, |
| "learning_rate": 1.0460818223139167e-05, |
| "loss": 0.4053, |
| "num_tokens": 439554233.0, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.08218044223719995, |
| "grad_norm": 0.17440171539783478, |
| "learning_rate": 1.0403961500855766e-05, |
| "loss": 0.359, |
| "num_tokens": 440187716.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.08229868747782902, |
| "grad_norm": 0.1777043342590332, |
| "learning_rate": 1.0347437257186311e-05, |
| "loss": 0.3862, |
| "num_tokens": 440823462.0, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.08241693271845808, |
| "grad_norm": 0.18520857393741608, |
| "learning_rate": 1.0291246323829772e-05, |
| "loss": 0.3751, |
| "num_tokens": 441461261.0, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.08253517795908714, |
| "grad_norm": 0.2085760086774826, |
| "learning_rate": 1.0235389527580807e-05, |
| "loss": 0.3989, |
| "num_tokens": 442092406.0, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.0826534231997162, |
| "grad_norm": 0.1899712234735489, |
| "learning_rate": 1.0179867690317546e-05, |
| "loss": 0.4033, |
| "num_tokens": 442729228.0, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.08277166844034528, |
| "grad_norm": 0.19925859570503235, |
| "learning_rate": 1.0124681628989546e-05, |
| "loss": 0.416, |
| "num_tokens": 443368453.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08288991368097434, |
| "grad_norm": 0.1773071438074112, |
| "learning_rate": 1.006983215560575e-05, |
| "loss": 0.3633, |
| "num_tokens": 444004993.0, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.0830081589216034, |
| "grad_norm": 0.20045937597751617, |
| "learning_rate": 1.001532007722252e-05, |
| "loss": 0.4294, |
| "num_tokens": 444641198.0, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.08312640416223246, |
| "grad_norm": 0.18577006459236145, |
| "learning_rate": 9.9611461959318e-06, |
| "loss": 0.3833, |
| "num_tokens": 445272456.0, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.08324464940286154, |
| "grad_norm": 0.21089830994606018, |
| "learning_rate": 9.907311308849286e-06, |
| "loss": 0.4268, |
| "num_tokens": 445909612.0, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.0833628946434906, |
| "grad_norm": 0.22879935801029205, |
| "learning_rate": 9.853816208102698e-06, |
| "loss": 0.4456, |
| "num_tokens": 446544323.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.08348113988411966, |
| "grad_norm": 0.1861100196838379, |
| "learning_rate": 9.800661680820146e-06, |
| "loss": 0.3963, |
| "num_tokens": 447177697.0, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.08359938512474872, |
| "grad_norm": 0.21287429332733154, |
| "learning_rate": 9.747848509118531e-06, |
| "loss": 0.4048, |
| "num_tokens": 447813578.0, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.0837176303653778, |
| "grad_norm": 0.19029271602630615, |
| "learning_rate": 9.69537747009204e-06, |
| "loss": 0.3696, |
| "num_tokens": 448448247.0, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.08383587560600686, |
| "grad_norm": 0.19157418608665466, |
| "learning_rate": 9.643249335800701e-06, |
| "loss": 0.3907, |
| "num_tokens": 449081260.0, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.08395412084663592, |
| "grad_norm": 0.1897335648536682, |
| "learning_rate": 9.591464873259048e-06, |
| "loss": 0.3519, |
| "num_tokens": 449718960.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.08407236608726498, |
| "grad_norm": 0.20246022939682007, |
| "learning_rate": 9.540024844424825e-06, |
| "loss": 0.3647, |
| "num_tokens": 450354221.0, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.08419061132789406, |
| "grad_norm": 0.22009633481502533, |
| "learning_rate": 9.48893000618775e-06, |
| "loss": 0.4162, |
| "num_tokens": 450990864.0, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.08430885656852312, |
| "grad_norm": 0.17846493422985077, |
| "learning_rate": 9.438181110358414e-06, |
| "loss": 0.347, |
| "num_tokens": 451629963.0, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.08442710180915218, |
| "grad_norm": 0.17807744443416595, |
| "learning_rate": 9.387778903657208e-06, |
| "loss": 0.3508, |
| "num_tokens": 452263375.0, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.08454534704978124, |
| "grad_norm": 0.2217930108308792, |
| "learning_rate": 9.337724127703315e-06, |
| "loss": 0.4266, |
| "num_tokens": 452899788.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.08466359229041032, |
| "grad_norm": 0.17611801624298096, |
| "learning_rate": 9.288017519003827e-06, |
| "loss": 0.3527, |
| "num_tokens": 453532800.0, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.08478183753103938, |
| "grad_norm": 0.18967418372631073, |
| "learning_rate": 9.2386598089429e-06, |
| "loss": 0.4013, |
| "num_tokens": 454167051.0, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.08490008277166844, |
| "grad_norm": 0.18361113965511322, |
| "learning_rate": 9.189651723770968e-06, |
| "loss": 0.3954, |
| "num_tokens": 454801891.0, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.0850183280122975, |
| "grad_norm": 0.18380604684352875, |
| "learning_rate": 9.140993984594098e-06, |
| "loss": 0.3798, |
| "num_tokens": 455434940.0, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.08513657325292658, |
| "grad_norm": 0.2047707885503769, |
| "learning_rate": 9.092687307363336e-06, |
| "loss": 0.4165, |
| "num_tokens": 456070522.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.08525481849355564, |
| "grad_norm": 0.1952807605266571, |
| "learning_rate": 9.044732402864214e-06, |
| "loss": 0.4127, |
| "num_tokens": 456700607.0, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.0853730637341847, |
| "grad_norm": 0.20445430278778076, |
| "learning_rate": 8.997129976706273e-06, |
| "loss": 0.3739, |
| "num_tokens": 457333591.0, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.08549130897481376, |
| "grad_norm": 0.18014107644557953, |
| "learning_rate": 8.949880729312658e-06, |
| "loss": 0.3939, |
| "num_tokens": 457972538.0, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.08560955421544283, |
| "grad_norm": 0.18680702149868011, |
| "learning_rate": 8.902985355909854e-06, |
| "loss": 0.3814, |
| "num_tokens": 458608333.0, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.0857277994560719, |
| "grad_norm": 0.1889398694038391, |
| "learning_rate": 8.856444546517439e-06, |
| "loss": 0.3846, |
| "num_tokens": 459238593.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.08584604469670096, |
| "grad_norm": 0.1750420778989792, |
| "learning_rate": 8.810258985937902e-06, |
| "loss": 0.3657, |
| "num_tokens": 459848240.0, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.08596428993733002, |
| "grad_norm": 0.2158506065607071, |
| "learning_rate": 8.764429353746627e-06, |
| "loss": 0.4134, |
| "num_tokens": 460483298.0, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.0860825351779591, |
| "grad_norm": 0.1906134933233261, |
| "learning_rate": 8.71895632428183e-06, |
| "loss": 0.3773, |
| "num_tokens": 461109500.0, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.08620078041858815, |
| "grad_norm": 0.193019300699234, |
| "learning_rate": 8.673840566634688e-06, |
| "loss": 0.3787, |
| "num_tokens": 461746594.0, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.08631902565921722, |
| "grad_norm": 0.18906846642494202, |
| "learning_rate": 8.629082744639463e-06, |
| "loss": 0.3829, |
| "num_tokens": 462380799.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.08643727089984628, |
| "grad_norm": 0.18636515736579895, |
| "learning_rate": 8.584683516863736e-06, |
| "loss": 0.3875, |
| "num_tokens": 463016862.0, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.08655551614047535, |
| "grad_norm": 0.17957797646522522, |
| "learning_rate": 8.540643536598749e-06, |
| "loss": 0.3563, |
| "num_tokens": 463650306.0, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.08667376138110441, |
| "grad_norm": 0.181325301527977, |
| "learning_rate": 8.496963451849745e-06, |
| "loss": 0.3773, |
| "num_tokens": 464282371.0, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.08679200662173348, |
| "grad_norm": 0.19648700952529907, |
| "learning_rate": 8.453643905326459e-06, |
| "loss": 0.3687, |
| "num_tokens": 464918493.0, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.08691025186236254, |
| "grad_norm": 0.19785350561141968, |
| "learning_rate": 8.410685534433676e-06, |
| "loss": 0.3761, |
| "num_tokens": 465551682.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.0870284971029916, |
| "grad_norm": 0.1787901520729065, |
| "learning_rate": 8.368088971261814e-06, |
| "loss": 0.3737, |
| "num_tokens": 466189560.0, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.08714674234362067, |
| "grad_norm": 0.19740906357765198, |
| "learning_rate": 8.32585484257766e-06, |
| "loss": 0.3781, |
| "num_tokens": 466826010.0, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.08726498758424973, |
| "grad_norm": 0.18967872858047485, |
| "learning_rate": 8.28398376981511e-06, |
| "loss": 0.3477, |
| "num_tokens": 467461700.0, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.0873832328248788, |
| "grad_norm": 0.16891902685165405, |
| "learning_rate": 8.242476369066072e-06, |
| "loss": 0.3352, |
| "num_tokens": 468097256.0, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.08750147806550786, |
| "grad_norm": 0.2073381245136261, |
| "learning_rate": 8.20133325107137e-06, |
| "loss": 0.4052, |
| "num_tokens": 468731535.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.08761972330613693, |
| "grad_norm": 0.18397468328475952, |
| "learning_rate": 8.160555021211748e-06, |
| "loss": 0.3544, |
| "num_tokens": 469363357.0, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.087737968546766, |
| "grad_norm": 0.19281727075576782, |
| "learning_rate": 8.12014227949899e-06, |
| "loss": 0.3782, |
| "num_tokens": 469996228.0, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.08785621378739505, |
| "grad_norm": 0.20584794878959656, |
| "learning_rate": 8.080095620567093e-06, |
| "loss": 0.4069, |
| "num_tokens": 470628575.0, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.08797445902802412, |
| "grad_norm": 0.18428972363471985, |
| "learning_rate": 8.040415633663469e-06, |
| "loss": 0.3892, |
| "num_tokens": 471265485.0, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.08809270426865319, |
| "grad_norm": 0.1747061312198639, |
| "learning_rate": 8.001102902640344e-06, |
| "loss": 0.3767, |
| "num_tokens": 471898145.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.08821094950928225, |
| "grad_norm": 0.18705062568187714, |
| "learning_rate": 7.962158005946105e-06, |
| "loss": 0.3754, |
| "num_tokens": 472533209.0, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.08832919474991131, |
| "grad_norm": 0.18788328766822815, |
| "learning_rate": 7.923581516616837e-06, |
| "loss": 0.3855, |
| "num_tokens": 473171790.0, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.08844743999054037, |
| "grad_norm": 0.18790322542190552, |
| "learning_rate": 7.88537400226787e-06, |
| "loss": 0.3487, |
| "num_tokens": 473806600.0, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.08856568523116945, |
| "grad_norm": 0.18705305457115173, |
| "learning_rate": 7.847536025085408e-06, |
| "loss": 0.3834, |
| "num_tokens": 474446221.0, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.08868393047179851, |
| "grad_norm": 0.1689257174730301, |
| "learning_rate": 7.810068141818299e-06, |
| "loss": 0.3533, |
| "num_tokens": 475080946.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.08880217571242757, |
| "grad_norm": 0.18348811566829681, |
| "learning_rate": 7.772970903769814e-06, |
| "loss": 0.3248, |
| "num_tokens": 475715589.0, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.08892042095305663, |
| "grad_norm": 0.194603830575943, |
| "learning_rate": 7.736244856789531e-06, |
| "loss": 0.3856, |
| "num_tokens": 476350099.0, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.08903866619368571, |
| "grad_norm": 0.19097204506397247, |
| "learning_rate": 7.69989054126533e-06, |
| "loss": 0.3998, |
| "num_tokens": 476986608.0, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.08915691143431477, |
| "grad_norm": 0.18063834309577942, |
| "learning_rate": 7.663908492115426e-06, |
| "loss": 0.3828, |
| "num_tokens": 477626286.0, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.08927515667494383, |
| "grad_norm": 0.1803908348083496, |
| "learning_rate": 7.628299238780476e-06, |
| "loss": 0.3851, |
| "num_tokens": 478262327.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.08939340191557289, |
| "grad_norm": 0.18068207800388336, |
| "learning_rate": 7.59306330521584e-06, |
| "loss": 0.3602, |
| "num_tokens": 478899878.0, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.08951164715620197, |
| "grad_norm": 0.1799282282590866, |
| "learning_rate": 7.558201209883818e-06, |
| "loss": 0.3743, |
| "num_tokens": 479538362.0, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.08962989239683103, |
| "grad_norm": 0.1710379421710968, |
| "learning_rate": 7.523713465746072e-06, |
| "loss": 0.3626, |
| "num_tokens": 480177217.0, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.08974813763746009, |
| "grad_norm": 0.18254569172859192, |
| "learning_rate": 7.489600580256027e-06, |
| "loss": 0.3839, |
| "num_tokens": 480809891.0, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.08986638287808915, |
| "grad_norm": 0.19266051054000854, |
| "learning_rate": 7.455863055351445e-06, |
| "loss": 0.3762, |
| "num_tokens": 481446104.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.08998462811871823, |
| "grad_norm": 0.16768276691436768, |
| "learning_rate": 7.422501387447021e-06, |
| "loss": 0.3582, |
| "num_tokens": 482084578.0, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.09010287335934729, |
| "grad_norm": 0.18206870555877686, |
| "learning_rate": 7.389516067427073e-06, |
| "loss": 0.3688, |
| "num_tokens": 482713767.0, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.09022111859997635, |
| "grad_norm": 0.21701638400554657, |
| "learning_rate": 7.356907580638336e-06, |
| "loss": 0.436, |
| "num_tokens": 483353280.0, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.09033936384060541, |
| "grad_norm": 0.15846391022205353, |
| "learning_rate": 7.324676406882817e-06, |
| "loss": 0.3657, |
| "num_tokens": 483985107.0, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.09045760908123449, |
| "grad_norm": 0.21575696766376495, |
| "learning_rate": 7.2928230204107194e-06, |
| "loss": 0.3862, |
| "num_tokens": 484615672.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.09057585432186355, |
| "grad_norm": 0.19652079045772552, |
| "learning_rate": 7.261347889913485e-06, |
| "loss": 0.3826, |
| "num_tokens": 485253394.0, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.09069409956249261, |
| "grad_norm": 0.1919373869895935, |
| "learning_rate": 7.230251478516881e-06, |
| "loss": 0.3903, |
| "num_tokens": 485886884.0, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.09081234480312167, |
| "grad_norm": 0.21124163269996643, |
| "learning_rate": 7.199534243774199e-06, |
| "loss": 0.3766, |
| "num_tokens": 486516495.0, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.09093059004375074, |
| "grad_norm": 0.18964266777038574, |
| "learning_rate": 7.169196637659522e-06, |
| "loss": 0.4244, |
| "num_tokens": 487151670.0, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.0910488352843798, |
| "grad_norm": 0.20490339398384094, |
| "learning_rate": 7.139239106561053e-06, |
| "loss": 0.3828, |
| "num_tokens": 487786678.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.09116708052500887, |
| "grad_norm": 0.20041170716285706, |
| "learning_rate": 7.109662091274574e-06, |
| "loss": 0.3998, |
| "num_tokens": 488423430.0, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.09128532576563793, |
| "grad_norm": 0.17842328548431396, |
| "learning_rate": 7.080466026996954e-06, |
| "loss": 0.3712, |
| "num_tokens": 489055057.0, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.091403571006267, |
| "grad_norm": 0.18228091299533844, |
| "learning_rate": 7.051651343319723e-06, |
| "loss": 0.3632, |
| "num_tokens": 489690318.0, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.09152181624689606, |
| "grad_norm": 0.19914202392101288, |
| "learning_rate": 7.023218464222788e-06, |
| "loss": 0.4109, |
| "num_tokens": 490315503.0, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.09164006148752513, |
| "grad_norm": 0.1682100147008896, |
| "learning_rate": 6.995167808068159e-06, |
| "loss": 0.3356, |
| "num_tokens": 490951658.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.09175830672815419, |
| "grad_norm": 0.18745863437652588, |
| "learning_rate": 6.9674997875938175e-06, |
| "loss": 0.3389, |
| "num_tokens": 491582936.0, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.09187655196878326, |
| "grad_norm": 0.18999552726745605, |
| "learning_rate": 6.940214809907637e-06, |
| "loss": 0.4062, |
| "num_tokens": 492221808.0, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.09199479720941232, |
| "grad_norm": 0.20237648487091064, |
| "learning_rate": 6.913313276481378e-06, |
| "loss": 0.3851, |
| "num_tokens": 492851928.0, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.09211304245004139, |
| "grad_norm": 0.18820145726203918, |
| "learning_rate": 6.886795583144813e-06, |
| "loss": 0.3408, |
| "num_tokens": 493484521.0, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.09223128769067045, |
| "grad_norm": 0.2060716450214386, |
| "learning_rate": 6.860662120079868e-06, |
| "loss": 0.4156, |
| "num_tokens": 494120278.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.09234953293129952, |
| "grad_norm": 0.1769654005765915, |
| "learning_rate": 6.834913271814898e-06, |
| "loss": 0.4094, |
| "num_tokens": 494748375.0, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.09246777817192858, |
| "grad_norm": 0.2025490701198578, |
| "learning_rate": 6.809549417219036e-06, |
| "loss": 0.3979, |
| "num_tokens": 495383913.0, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.09258602341255764, |
| "grad_norm": 0.1910087913274765, |
| "learning_rate": 6.784570929496596e-06, |
| "loss": 0.3656, |
| "num_tokens": 496008813.0, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.0927042686531867, |
| "grad_norm": 0.18994437158107758, |
| "learning_rate": 6.759978176181609e-06, |
| "loss": 0.3939, |
| "num_tokens": 496639648.0, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.09282251389381578, |
| "grad_norm": 0.1858188956975937, |
| "learning_rate": 6.7357715191323985e-06, |
| "loss": 0.3416, |
| "num_tokens": 497274171.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.09294075913444484, |
| "grad_norm": 0.17720621824264526, |
| "learning_rate": 6.711951314526245e-06, |
| "loss": 0.3714, |
| "num_tokens": 497913138.0, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.0930590043750739, |
| "grad_norm": 0.18589583039283752, |
| "learning_rate": 6.688517912854183e-06, |
| "loss": 0.4066, |
| "num_tokens": 498551639.0, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.09317724961570296, |
| "grad_norm": 0.19449126720428467, |
| "learning_rate": 6.665471658915793e-06, |
| "loss": 0.3974, |
| "num_tokens": 499182979.0, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.09329549485633203, |
| "grad_norm": 0.20465314388275146, |
| "learning_rate": 6.642812891814178e-06, |
| "loss": 0.3752, |
| "num_tokens": 499817574.0, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.0934137400969611, |
| "grad_norm": 0.20144398510456085, |
| "learning_rate": 6.620541944950941e-06, |
| "loss": 0.4221, |
| "num_tokens": 500450987.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.09353198533759016, |
| "grad_norm": 0.20738765597343445, |
| "learning_rate": 6.598659146021286e-06, |
| "loss": 0.4083, |
| "num_tokens": 501086255.0, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.09365023057821922, |
| "grad_norm": 0.18912115693092346, |
| "learning_rate": 6.577164817009207e-06, |
| "loss": 0.375, |
| "num_tokens": 501724060.0, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.09376847581884828, |
| "grad_norm": 0.17531508207321167, |
| "learning_rate": 6.556059274182744e-06, |
| "loss": 0.3698, |
| "num_tokens": 502336426.0, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.09388672105947736, |
| "grad_norm": 0.2189079225063324, |
| "learning_rate": 6.535342828089317e-06, |
| "loss": 0.4016, |
| "num_tokens": 502970977.0, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.09400496630010642, |
| "grad_norm": 0.20233234763145447, |
| "learning_rate": 6.515015783551183e-06, |
| "loss": 0.332, |
| "num_tokens": 503604914.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.09412321154073548, |
| "grad_norm": 0.19407616555690765, |
| "learning_rate": 6.495078439660918e-06, |
| "loss": 0.3673, |
| "num_tokens": 504241729.0, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.09424145678136454, |
| "grad_norm": 0.19280032813549042, |
| "learning_rate": 6.475531089777052e-06, |
| "loss": 0.3671, |
| "num_tokens": 504874311.0, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.09435970202199362, |
| "grad_norm": 0.1831720918416977, |
| "learning_rate": 6.456374021519726e-06, |
| "loss": 0.3887, |
| "num_tokens": 505507864.0, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.09447794726262268, |
| "grad_norm": 0.19168098270893097, |
| "learning_rate": 6.4376075167664654e-06, |
| "loss": 0.3912, |
| "num_tokens": 506141369.0, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.09459619250325174, |
| "grad_norm": 0.18384189903736115, |
| "learning_rate": 6.419231851648044e-06, |
| "loss": 0.3676, |
| "num_tokens": 506777001.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0947144377438808, |
| "grad_norm": 0.16950486600399017, |
| "learning_rate": 6.401247296544408e-06, |
| "loss": 0.3298, |
| "num_tokens": 507406711.0, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.09483268298450988, |
| "grad_norm": 0.1815839558839798, |
| "learning_rate": 6.383654116080699e-06, |
| "loss": 0.3838, |
| "num_tokens": 508042267.0, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.09495092822513894, |
| "grad_norm": 0.18775033950805664, |
| "learning_rate": 6.366452569123366e-06, |
| "loss": 0.3549, |
| "num_tokens": 508675609.0, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.095069173465768, |
| "grad_norm": 0.17260177433490753, |
| "learning_rate": 6.3496429087763535e-06, |
| "loss": 0.3564, |
| "num_tokens": 509312065.0, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.09518741870639706, |
| "grad_norm": 0.1801680028438568, |
| "learning_rate": 6.333225382377383e-06, |
| "loss": 0.3679, |
| "num_tokens": 509946717.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.09530566394702614, |
| "grad_norm": 0.1752161681652069, |
| "learning_rate": 6.3172002314943e-06, |
| "loss": 0.3705, |
| "num_tokens": 510583518.0, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.0954239091876552, |
| "grad_norm": 0.19134798645973206, |
| "learning_rate": 6.30156769192153e-06, |
| "loss": 0.3984, |
| "num_tokens": 511221107.0, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.09554215442828426, |
| "grad_norm": 0.1874755322933197, |
| "learning_rate": 6.286327993676615e-06, |
| "loss": 0.3846, |
| "num_tokens": 511860697.0, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.09566039966891332, |
| "grad_norm": 0.17865346372127533, |
| "learning_rate": 6.271481360996808e-06, |
| "loss": 0.3737, |
| "num_tokens": 512498128.0, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.0957786449095424, |
| "grad_norm": 0.19846026599407196, |
| "learning_rate": 6.257028012335795e-06, |
| "loss": 0.4089, |
| "num_tokens": 513128610.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.09589689015017146, |
| "grad_norm": 0.16993194818496704, |
| "learning_rate": 6.2429681603604726e-06, |
| "loss": 0.3392, |
| "num_tokens": 513765105.0, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.09601513539080052, |
| "grad_norm": 0.17878930270671844, |
| "learning_rate": 6.229302011947814e-06, |
| "loss": 0.3964, |
| "num_tokens": 514394034.0, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.09613338063142958, |
| "grad_norm": 0.18822607398033142, |
| "learning_rate": 6.2160297681818316e-06, |
| "loss": 0.3763, |
| "num_tokens": 515033384.0, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.09625162587205865, |
| "grad_norm": 0.19209401309490204, |
| "learning_rate": 6.2031516243506175e-06, |
| "loss": 0.3585, |
| "num_tokens": 515667789.0, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.09636987111268772, |
| "grad_norm": 0.19187025725841522, |
| "learning_rate": 6.190667769943463e-06, |
| "loss": 0.3625, |
| "num_tokens": 516301878.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.09648811635331678, |
| "grad_norm": 0.17314016819000244, |
| "learning_rate": 6.178578388648084e-06, |
| "loss": 0.3548, |
| "num_tokens": 516936923.0, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.09660636159394584, |
| "grad_norm": 0.19279181957244873, |
| "learning_rate": 6.166883658347904e-06, |
| "loss": 0.4, |
| "num_tokens": 517574893.0, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.09672460683457491, |
| "grad_norm": 0.1702749878168106, |
| "learning_rate": 6.155583751119448e-06, |
| "loss": 0.3694, |
| "num_tokens": 518213624.0, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.09684285207520398, |
| "grad_norm": 0.1792595386505127, |
| "learning_rate": 6.1446788332298e-06, |
| "loss": 0.3538, |
| "num_tokens": 518852531.0, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.09696109731583304, |
| "grad_norm": 0.18162083625793457, |
| "learning_rate": 6.134169065134162e-06, |
| "loss": 0.3896, |
| "num_tokens": 519492204.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.0970793425564621, |
| "grad_norm": 0.18663813173770905, |
| "learning_rate": 6.124054601473502e-06, |
| "loss": 0.3965, |
| "num_tokens": 520130296.0, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.09719758779709117, |
| "grad_norm": 0.1922474354505539, |
| "learning_rate": 6.114335591072261e-06, |
| "loss": 0.3621, |
| "num_tokens": 520765986.0, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.09731583303772023, |
| "grad_norm": 0.236387699842453, |
| "learning_rate": 6.105012176936177e-06, |
| "loss": 0.4225, |
| "num_tokens": 521400644.0, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.0974340782783493, |
| "grad_norm": 0.17774070799350739, |
| "learning_rate": 6.096084496250168e-06, |
| "loss": 0.364, |
| "num_tokens": 522039463.0, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.09755232351897836, |
| "grad_norm": 0.18863226473331451, |
| "learning_rate": 6.087552680376332e-06, |
| "loss": 0.3668, |
| "num_tokens": 522671508.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.09767056875960743, |
| "grad_norm": 0.19288307428359985, |
| "learning_rate": 6.079416854851993e-06, |
| "loss": 0.3596, |
| "num_tokens": 523311225.0, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.0977888140002365, |
| "grad_norm": 0.1993461400270462, |
| "learning_rate": 6.071677139387874e-06, |
| "loss": 0.3414, |
| "num_tokens": 523949133.0, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.09790705924086555, |
| "grad_norm": 0.18140719830989838, |
| "learning_rate": 6.064333647866317e-06, |
| "loss": 0.3793, |
| "num_tokens": 524577955.0, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.09802530448149462, |
| "grad_norm": 0.18989813327789307, |
| "learning_rate": 6.057386488339618e-06, |
| "loss": 0.3784, |
| "num_tokens": 525211514.0, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.09814354972212369, |
| "grad_norm": 0.18679462373256683, |
| "learning_rate": 6.050835763028446e-06, |
| "loss": 0.4006, |
| "num_tokens": 525848086.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.09826179496275275, |
| "grad_norm": 0.17804615199565887, |
| "learning_rate": 6.04468156832031e-06, |
| "loss": 0.3619, |
| "num_tokens": 526481722.0, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.09838004020338181, |
| "grad_norm": 0.1832081377506256, |
| "learning_rate": 6.038923994768173e-06, |
| "loss": 0.3818, |
| "num_tokens": 527117956.0, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.09849828544401087, |
| "grad_norm": 0.20609410107135773, |
| "learning_rate": 6.033563127089097e-06, |
| "loss": 0.4023, |
| "num_tokens": 527750234.0, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.09861653068463995, |
| "grad_norm": 0.201175257563591, |
| "learning_rate": 6.02859904416301e-06, |
| "loss": 0.3745, |
| "num_tokens": 528386561.0, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.09873477592526901, |
| "grad_norm": 0.20368118584156036, |
| "learning_rate": 6.024031819031541e-06, |
| "loss": 0.4117, |
| "num_tokens": 529021750.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.09885302116589807, |
| "grad_norm": 0.18870466947555542, |
| "learning_rate": 6.019861518896941e-06, |
| "loss": 0.3533, |
| "num_tokens": 529661276.0, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.09897126640652713, |
| "grad_norm": 0.2020527869462967, |
| "learning_rate": 6.016088205121099e-06, |
| "loss": 0.3947, |
| "num_tokens": 530297609.0, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.09908951164715621, |
| "grad_norm": 0.18172025680541992, |
| "learning_rate": 6.012711933224636e-06, |
| "loss": 0.3672, |
| "num_tokens": 530933315.0, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.09920775688778527, |
| "grad_norm": 0.1858338862657547, |
| "learning_rate": 6.009732752886096e-06, |
| "loss": 0.381, |
| "num_tokens": 531564788.0, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.09932600212841433, |
| "grad_norm": 0.18906207382678986, |
| "learning_rate": 6.0071507079412e-06, |
| "loss": 0.384, |
| "num_tokens": 532193430.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.09944424736904339, |
| "grad_norm": 0.1974787414073944, |
| "learning_rate": 6.004965836382215e-06, |
| "loss": 0.3912, |
| "num_tokens": 532828601.0, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.09956249260967245, |
| "grad_norm": 0.18472707271575928, |
| "learning_rate": 6.003178170357397e-06, |
| "loss": 0.3508, |
| "num_tokens": 533466099.0, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.09968073785030153, |
| "grad_norm": 0.17779730260372162, |
| "learning_rate": 6.001787736170496e-06, |
| "loss": 0.3865, |
| "num_tokens": 534102611.0, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.09979898309093059, |
| "grad_norm": 0.19053196907043457, |
| "learning_rate": 6.000794554280395e-06, |
| "loss": 0.3733, |
| "num_tokens": 534731488.0, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.09991722833155965, |
| "grad_norm": 0.1939527839422226, |
| "learning_rate": 6.0001986393007945e-06, |
| "loss": 0.3785, |
| "num_tokens": 535370116.0, |
| "step": 845 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 845, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 845, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.527336137257124e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|