swesmith-1.7b-535M / trainer_state.json
ricdomolm's picture
Upload folder using huggingface_hub
e43a1c7 verified
Raw
History Blame Contribute Delete
176 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.09991722833155965,
"eval_steps": 500,
"global_step": 845,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00011824524062906468,
"grad_norm": 1.7523608207702637,
"learning_rate": 0.0,
"loss": 0.9062,
"num_tokens": 628048.0,
"step": 1
},
{
"epoch": 0.00023649048125812936,
"grad_norm": 1.6680941581726074,
"learning_rate": 2.307692307692308e-06,
"loss": 0.8713,
"num_tokens": 1266689.0,
"step": 2
},
{
"epoch": 0.000354735721887194,
"grad_norm": 1.625159502029419,
"learning_rate": 4.615384615384616e-06,
"loss": 0.873,
"num_tokens": 1900338.0,
"step": 3
},
{
"epoch": 0.0004729809625162587,
"grad_norm": 1.3783349990844727,
"learning_rate": 6.923076923076923e-06,
"loss": 0.8628,
"num_tokens": 2539095.0,
"step": 4
},
{
"epoch": 0.0005912262031453234,
"grad_norm": 1.1593743562698364,
"learning_rate": 9.230769230769232e-06,
"loss": 0.8533,
"num_tokens": 3172740.0,
"step": 5
},
{
"epoch": 0.000709471443774388,
"grad_norm": 1.1372928619384766,
"learning_rate": 1.153846153846154e-05,
"loss": 0.8015,
"num_tokens": 3810090.0,
"step": 6
},
{
"epoch": 0.0008277166844034528,
"grad_norm": 1.4683241844177246,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.8613,
"num_tokens": 4447913.0,
"step": 7
},
{
"epoch": 0.0009459619250325174,
"grad_norm": 1.2824925184249878,
"learning_rate": 1.6153846153846154e-05,
"loss": 0.7549,
"num_tokens": 5085253.0,
"step": 8
},
{
"epoch": 0.001064207165661582,
"grad_norm": 0.9277980923652649,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.7863,
"num_tokens": 5721930.0,
"step": 9
},
{
"epoch": 0.0011824524062906468,
"grad_norm": 1.0162955522537231,
"learning_rate": 2.076923076923077e-05,
"loss": 0.755,
"num_tokens": 6354428.0,
"step": 10
},
{
"epoch": 0.0013006976469197116,
"grad_norm": 0.8888687491416931,
"learning_rate": 2.307692307692308e-05,
"loss": 0.7388,
"num_tokens": 6988898.0,
"step": 11
},
{
"epoch": 0.001418942887548776,
"grad_norm": 0.722545862197876,
"learning_rate": 2.5384615384615386e-05,
"loss": 0.7017,
"num_tokens": 7627058.0,
"step": 12
},
{
"epoch": 0.0015371881281778408,
"grad_norm": 0.7729371786117554,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.6836,
"num_tokens": 8261730.0,
"step": 13
},
{
"epoch": 0.0016554333688069056,
"grad_norm": 0.6504688858985901,
"learning_rate": 3e-05,
"loss": 0.6866,
"num_tokens": 8889249.0,
"step": 14
},
{
"epoch": 0.0017736786094359701,
"grad_norm": 0.6326490640640259,
"learning_rate": 3.230769230769231e-05,
"loss": 0.6229,
"num_tokens": 9524710.0,
"step": 15
},
{
"epoch": 0.0018919238500650349,
"grad_norm": 0.6311523914337158,
"learning_rate": 3.461538461538461e-05,
"loss": 0.6436,
"num_tokens": 10161723.0,
"step": 16
},
{
"epoch": 0.0020101690906940994,
"grad_norm": 2.1880621910095215,
"learning_rate": 3.692307692307693e-05,
"loss": 0.6368,
"num_tokens": 10793622.0,
"step": 17
},
{
"epoch": 0.002128414331323164,
"grad_norm": 0.766629695892334,
"learning_rate": 3.923076923076923e-05,
"loss": 0.64,
"num_tokens": 11420435.0,
"step": 18
},
{
"epoch": 0.002246659571952229,
"grad_norm": 0.6217833161354065,
"learning_rate": 4.153846153846154e-05,
"loss": 0.6326,
"num_tokens": 12055174.0,
"step": 19
},
{
"epoch": 0.0023649048125812936,
"grad_norm": 0.5231024026870728,
"learning_rate": 4.384615384615385e-05,
"loss": 0.5653,
"num_tokens": 12687424.0,
"step": 20
},
{
"epoch": 0.0024831500532103584,
"grad_norm": 0.5611249804496765,
"learning_rate": 4.615384615384616e-05,
"loss": 0.606,
"num_tokens": 13318953.0,
"step": 21
},
{
"epoch": 0.002601395293839423,
"grad_norm": 0.5640860199928284,
"learning_rate": 4.846153846153846e-05,
"loss": 0.5724,
"num_tokens": 13937259.0,
"step": 22
},
{
"epoch": 0.0027196405344684874,
"grad_norm": 0.48454275727272034,
"learning_rate": 5.076923076923077e-05,
"loss": 0.5994,
"num_tokens": 14569053.0,
"step": 23
},
{
"epoch": 0.002837885775097552,
"grad_norm": 0.6201558113098145,
"learning_rate": 5.3076923076923076e-05,
"loss": 0.6035,
"num_tokens": 15199901.0,
"step": 24
},
{
"epoch": 0.002956131015726617,
"grad_norm": 0.773175060749054,
"learning_rate": 5.538461538461539e-05,
"loss": 0.5975,
"num_tokens": 15830276.0,
"step": 25
},
{
"epoch": 0.0030743762563556817,
"grad_norm": 0.5781369209289551,
"learning_rate": 5.76923076923077e-05,
"loss": 0.5864,
"num_tokens": 16465223.0,
"step": 26
},
{
"epoch": 0.0031926214969847464,
"grad_norm": 0.5451337695121765,
"learning_rate": 6e-05,
"loss": 0.5692,
"num_tokens": 17102309.0,
"step": 27
},
{
"epoch": 0.003310866737613811,
"grad_norm": 0.48841163516044617,
"learning_rate": 5.9999801360699206e-05,
"loss": 0.5736,
"num_tokens": 17738787.0,
"step": 28
},
{
"epoch": 0.003429111978242876,
"grad_norm": 0.43556851148605347,
"learning_rate": 5.9999205445719606e-05,
"loss": 0.5085,
"num_tokens": 18370503.0,
"step": 29
},
{
"epoch": 0.0035473572188719402,
"grad_norm": 0.4315873682498932,
"learning_rate": 5.999821226382951e-05,
"loss": 0.5342,
"num_tokens": 19008700.0,
"step": 30
},
{
"epoch": 0.003665602459501005,
"grad_norm": 0.48499879240989685,
"learning_rate": 5.99968218296426e-05,
"loss": 0.5414,
"num_tokens": 19641076.0,
"step": 31
},
{
"epoch": 0.0037838477001300697,
"grad_norm": 0.3707197308540344,
"learning_rate": 5.999503416361778e-05,
"loss": 0.4694,
"num_tokens": 20268470.0,
"step": 32
},
{
"epoch": 0.0039020929407591345,
"grad_norm": 0.4602040946483612,
"learning_rate": 5.99928492920588e-05,
"loss": 0.545,
"num_tokens": 20903791.0,
"step": 33
},
{
"epoch": 0.004020338181388199,
"grad_norm": 0.4377839267253876,
"learning_rate": 5.999026724711391e-05,
"loss": 0.5273,
"num_tokens": 21537889.0,
"step": 34
},
{
"epoch": 0.0041385834220172635,
"grad_norm": 0.3558352291584015,
"learning_rate": 5.998728806677537e-05,
"loss": 0.4575,
"num_tokens": 22169163.0,
"step": 35
},
{
"epoch": 0.004256828662646328,
"grad_norm": 0.4064357280731201,
"learning_rate": 5.99839117948789e-05,
"loss": 0.5139,
"num_tokens": 22802993.0,
"step": 36
},
{
"epoch": 0.004375073903275393,
"grad_norm": 0.40676349401474,
"learning_rate": 5.998013848110306e-05,
"loss": 0.4923,
"num_tokens": 23436910.0,
"step": 37
},
{
"epoch": 0.004493319143904458,
"grad_norm": 0.4407147765159607,
"learning_rate": 5.997596818096846e-05,
"loss": 0.5295,
"num_tokens": 24066237.0,
"step": 38
},
{
"epoch": 0.0046115643845335225,
"grad_norm": 0.42400693893432617,
"learning_rate": 5.997140095583699e-05,
"loss": 0.4883,
"num_tokens": 24702070.0,
"step": 39
},
{
"epoch": 0.004729809625162587,
"grad_norm": 0.36618033051490784,
"learning_rate": 5.99664368729109e-05,
"loss": 0.4745,
"num_tokens": 25335554.0,
"step": 40
},
{
"epoch": 0.004848054865791652,
"grad_norm": 0.32382911443710327,
"learning_rate": 5.996107600523183e-05,
"loss": 0.4362,
"num_tokens": 25967347.0,
"step": 41
},
{
"epoch": 0.004966300106420717,
"grad_norm": 0.39235326647758484,
"learning_rate": 5.995531843167969e-05,
"loss": 0.4558,
"num_tokens": 26599914.0,
"step": 42
},
{
"epoch": 0.0050845453470497815,
"grad_norm": 0.3560352921485901,
"learning_rate": 5.9949164236971555e-05,
"loss": 0.5103,
"num_tokens": 27231412.0,
"step": 43
},
{
"epoch": 0.005202790587678846,
"grad_norm": 0.4623904526233673,
"learning_rate": 5.994261351166038e-05,
"loss": 0.5168,
"num_tokens": 27868949.0,
"step": 44
},
{
"epoch": 0.005321035828307911,
"grad_norm": 0.3559505343437195,
"learning_rate": 5.99356663521337e-05,
"loss": 0.4952,
"num_tokens": 28505673.0,
"step": 45
},
{
"epoch": 0.005439281068936975,
"grad_norm": 0.4067099690437317,
"learning_rate": 5.9928322860612126e-05,
"loss": 0.5023,
"num_tokens": 29143956.0,
"step": 46
},
{
"epoch": 0.00555752630956604,
"grad_norm": 0.40066882967948914,
"learning_rate": 5.992058314514801e-05,
"loss": 0.5023,
"num_tokens": 29780377.0,
"step": 47
},
{
"epoch": 0.005675771550195104,
"grad_norm": 0.3772350251674652,
"learning_rate": 5.9912447319623676e-05,
"loss": 0.5043,
"num_tokens": 30419988.0,
"step": 48
},
{
"epoch": 0.005794016790824169,
"grad_norm": 0.36861610412597656,
"learning_rate": 5.9903915503749835e-05,
"loss": 0.4962,
"num_tokens": 31058641.0,
"step": 49
},
{
"epoch": 0.005912262031453234,
"grad_norm": 0.36637431383132935,
"learning_rate": 5.989498782306382e-05,
"loss": 0.4995,
"num_tokens": 31696113.0,
"step": 50
},
{
"epoch": 0.006030507272082299,
"grad_norm": 0.432847797870636,
"learning_rate": 5.9885664408927744e-05,
"loss": 0.5389,
"num_tokens": 32335026.0,
"step": 51
},
{
"epoch": 0.006148752512711363,
"grad_norm": 0.37941452860832214,
"learning_rate": 5.98759453985265e-05,
"loss": 0.4635,
"num_tokens": 32968147.0,
"step": 52
},
{
"epoch": 0.006266997753340428,
"grad_norm": 0.4316914677619934,
"learning_rate": 5.9865830934865846e-05,
"loss": 0.5001,
"num_tokens": 33599687.0,
"step": 53
},
{
"epoch": 0.006385242993969493,
"grad_norm": 0.36900049448013306,
"learning_rate": 5.98553211667702e-05,
"loss": 0.4727,
"num_tokens": 34235952.0,
"step": 54
},
{
"epoch": 0.006503488234598558,
"grad_norm": 0.39060965180397034,
"learning_rate": 5.9844416248880556e-05,
"loss": 0.4985,
"num_tokens": 34855614.0,
"step": 55
},
{
"epoch": 0.006621733475227622,
"grad_norm": 0.4554467499256134,
"learning_rate": 5.983311634165209e-05,
"loss": 0.5408,
"num_tokens": 35490773.0,
"step": 56
},
{
"epoch": 0.006739978715856687,
"grad_norm": 0.3941882252693176,
"learning_rate": 5.982142161135191e-05,
"loss": 0.5216,
"num_tokens": 36118336.0,
"step": 57
},
{
"epoch": 0.006858223956485752,
"grad_norm": 0.39384809136390686,
"learning_rate": 5.9809332230056545e-05,
"loss": 0.4929,
"num_tokens": 36751911.0,
"step": 58
},
{
"epoch": 0.0069764691971148166,
"grad_norm": 0.38750314712524414,
"learning_rate": 5.979684837564939e-05,
"loss": 0.4889,
"num_tokens": 37375413.0,
"step": 59
},
{
"epoch": 0.0070947144377438804,
"grad_norm": 0.387765496969223,
"learning_rate": 5.978397023181817e-05,
"loss": 0.4611,
"num_tokens": 38006888.0,
"step": 60
},
{
"epoch": 0.007212959678372945,
"grad_norm": 0.36486542224884033,
"learning_rate": 5.977069798805219e-05,
"loss": 0.4789,
"num_tokens": 38640497.0,
"step": 61
},
{
"epoch": 0.00733120491900201,
"grad_norm": 0.32066309452056885,
"learning_rate": 5.975703183963953e-05,
"loss": 0.4666,
"num_tokens": 39273313.0,
"step": 62
},
{
"epoch": 0.007449450159631075,
"grad_norm": 0.4485960304737091,
"learning_rate": 5.97429719876642e-05,
"loss": 0.5988,
"num_tokens": 39910620.0,
"step": 63
},
{
"epoch": 0.007567695400260139,
"grad_norm": 0.45505833625793457,
"learning_rate": 5.97285186390032e-05,
"loss": 0.4779,
"num_tokens": 40546868.0,
"step": 64
},
{
"epoch": 0.007685940640889204,
"grad_norm": 0.3256620168685913,
"learning_rate": 5.9713672006323386e-05,
"loss": 0.4478,
"num_tokens": 41182518.0,
"step": 65
},
{
"epoch": 0.007804185881518269,
"grad_norm": 0.4429851770401001,
"learning_rate": 5.969843230807847e-05,
"loss": 0.4945,
"num_tokens": 41817083.0,
"step": 66
},
{
"epoch": 0.007922431122147333,
"grad_norm": 0.4284612536430359,
"learning_rate": 5.96827997685057e-05,
"loss": 0.4906,
"num_tokens": 42454047.0,
"step": 67
},
{
"epoch": 0.008040676362776398,
"grad_norm": 0.41103261709213257,
"learning_rate": 5.966677461762262e-05,
"loss": 0.4699,
"num_tokens": 43066339.0,
"step": 68
},
{
"epoch": 0.008158921603405462,
"grad_norm": 0.41535407304763794,
"learning_rate": 5.965035709122364e-05,
"loss": 0.516,
"num_tokens": 43703254.0,
"step": 69
},
{
"epoch": 0.008277166844034527,
"grad_norm": 0.41889598965644836,
"learning_rate": 5.963354743087664e-05,
"loss": 0.5109,
"num_tokens": 44339105.0,
"step": 70
},
{
"epoch": 0.008395412084663592,
"grad_norm": 0.37383216619491577,
"learning_rate": 5.9616345883919304e-05,
"loss": 0.4497,
"num_tokens": 44969251.0,
"step": 71
},
{
"epoch": 0.008513657325292657,
"grad_norm": 0.39339086413383484,
"learning_rate": 5.9598752703455596e-05,
"loss": 0.4967,
"num_tokens": 45605957.0,
"step": 72
},
{
"epoch": 0.008631902565921721,
"grad_norm": 0.34155574440956116,
"learning_rate": 5.958076814835196e-05,
"loss": 0.4478,
"num_tokens": 46242216.0,
"step": 73
},
{
"epoch": 0.008750147806550786,
"grad_norm": 0.40994498133659363,
"learning_rate": 5.956239248323354e-05,
"loss": 0.4974,
"num_tokens": 46879736.0,
"step": 74
},
{
"epoch": 0.00886839304717985,
"grad_norm": 0.34924760460853577,
"learning_rate": 5.9543625978480276e-05,
"loss": 0.4551,
"num_tokens": 47508876.0,
"step": 75
},
{
"epoch": 0.008986638287808916,
"grad_norm": 0.3592020869255066,
"learning_rate": 5.952446891022294e-05,
"loss": 0.4589,
"num_tokens": 48148110.0,
"step": 76
},
{
"epoch": 0.00910488352843798,
"grad_norm": 0.3335554003715515,
"learning_rate": 5.9504921560339085e-05,
"loss": 0.4415,
"num_tokens": 48779111.0,
"step": 77
},
{
"epoch": 0.009223128769067045,
"grad_norm": 0.33642348647117615,
"learning_rate": 5.948498421644883e-05,
"loss": 0.4479,
"num_tokens": 49414520.0,
"step": 78
},
{
"epoch": 0.00934137400969611,
"grad_norm": 0.3461398184299469,
"learning_rate": 5.9464657171910686e-05,
"loss": 0.4697,
"num_tokens": 50047364.0,
"step": 79
},
{
"epoch": 0.009459619250325174,
"grad_norm": 0.35207217931747437,
"learning_rate": 5.944394072581726e-05,
"loss": 0.4365,
"num_tokens": 50679909.0,
"step": 80
},
{
"epoch": 0.00957786449095424,
"grad_norm": 0.3526061177253723,
"learning_rate": 5.9422835182990794e-05,
"loss": 0.447,
"num_tokens": 51313449.0,
"step": 81
},
{
"epoch": 0.009696109731583304,
"grad_norm": 0.3391474783420563,
"learning_rate": 5.940134085397872e-05,
"loss": 0.4642,
"num_tokens": 51949695.0,
"step": 82
},
{
"epoch": 0.009814354972212369,
"grad_norm": 0.3502749800682068,
"learning_rate": 5.937945805504906e-05,
"loss": 0.4723,
"num_tokens": 52582348.0,
"step": 83
},
{
"epoch": 0.009932600212841433,
"grad_norm": 0.35535070300102234,
"learning_rate": 5.9357187108185826e-05,
"loss": 0.4752,
"num_tokens": 53211571.0,
"step": 84
},
{
"epoch": 0.010050845453470498,
"grad_norm": 0.3651171922683716,
"learning_rate": 5.933452834108421e-05,
"loss": 0.4694,
"num_tokens": 53846179.0,
"step": 85
},
{
"epoch": 0.010169090694099563,
"grad_norm": 0.32200363278388977,
"learning_rate": 5.931148208714582e-05,
"loss": 0.4597,
"num_tokens": 54478244.0,
"step": 86
},
{
"epoch": 0.010287335934728628,
"grad_norm": 0.35413858294487,
"learning_rate": 5.9288048685473756e-05,
"loss": 0.4795,
"num_tokens": 55113336.0,
"step": 87
},
{
"epoch": 0.010405581175357692,
"grad_norm": 0.28715524077415466,
"learning_rate": 5.92642284808676e-05,
"loss": 0.4432,
"num_tokens": 55750901.0,
"step": 88
},
{
"epoch": 0.010523826415986757,
"grad_norm": 0.3725243806838989,
"learning_rate": 5.924002182381839e-05,
"loss": 0.5214,
"num_tokens": 56387320.0,
"step": 89
},
{
"epoch": 0.010642071656615822,
"grad_norm": 0.3085726499557495,
"learning_rate": 5.9215429070503406e-05,
"loss": 0.4465,
"num_tokens": 57023503.0,
"step": 90
},
{
"epoch": 0.010760316897244887,
"grad_norm": 0.3731476366519928,
"learning_rate": 5.9190450582780974e-05,
"loss": 0.5066,
"num_tokens": 57651196.0,
"step": 91
},
{
"epoch": 0.01087856213787395,
"grad_norm": 0.32896849513053894,
"learning_rate": 5.9165086728185106e-05,
"loss": 0.4651,
"num_tokens": 58290170.0,
"step": 92
},
{
"epoch": 0.010996807378503014,
"grad_norm": 0.29874181747436523,
"learning_rate": 5.913933787992013e-05,
"loss": 0.4323,
"num_tokens": 58929585.0,
"step": 93
},
{
"epoch": 0.01111505261913208,
"grad_norm": 0.3025204539299011,
"learning_rate": 5.9113204416855196e-05,
"loss": 0.4362,
"num_tokens": 59569034.0,
"step": 94
},
{
"epoch": 0.011233297859761144,
"grad_norm": 0.3040831685066223,
"learning_rate": 5.908668672351862e-05,
"loss": 0.4681,
"num_tokens": 60197509.0,
"step": 95
},
{
"epoch": 0.011351543100390209,
"grad_norm": 0.33227190375328064,
"learning_rate": 5.9059785190092366e-05,
"loss": 0.4445,
"num_tokens": 60830564.0,
"step": 96
},
{
"epoch": 0.011469788341019273,
"grad_norm": 0.3173273503780365,
"learning_rate": 5.9032500212406184e-05,
"loss": 0.4706,
"num_tokens": 61466570.0,
"step": 97
},
{
"epoch": 0.011588033581648338,
"grad_norm": 0.3499050438404083,
"learning_rate": 5.900483219193184e-05,
"loss": 0.4474,
"num_tokens": 62100797.0,
"step": 98
},
{
"epoch": 0.011706278822277403,
"grad_norm": 0.29081398248672485,
"learning_rate": 5.8976781535777215e-05,
"loss": 0.4548,
"num_tokens": 62734274.0,
"step": 99
},
{
"epoch": 0.011824524062906468,
"grad_norm": 0.3550204634666443,
"learning_rate": 5.894834865668028e-05,
"loss": 0.4781,
"num_tokens": 63350637.0,
"step": 100
},
{
"epoch": 0.011942769303535532,
"grad_norm": 0.3122808635234833,
"learning_rate": 5.891953397300305e-05,
"loss": 0.4562,
"num_tokens": 63989248.0,
"step": 101
},
{
"epoch": 0.012061014544164597,
"grad_norm": 0.3456708490848541,
"learning_rate": 5.889033790872542e-05,
"loss": 0.4657,
"num_tokens": 64623402.0,
"step": 102
},
{
"epoch": 0.012179259784793662,
"grad_norm": 0.30247852206230164,
"learning_rate": 5.886076089343895e-05,
"loss": 0.42,
"num_tokens": 65263084.0,
"step": 103
},
{
"epoch": 0.012297505025422727,
"grad_norm": 0.34775105118751526,
"learning_rate": 5.883080336234049e-05,
"loss": 0.4833,
"num_tokens": 65895544.0,
"step": 104
},
{
"epoch": 0.012415750266051791,
"grad_norm": 0.35499584674835205,
"learning_rate": 5.88004657562258e-05,
"loss": 0.4397,
"num_tokens": 66526292.0,
"step": 105
},
{
"epoch": 0.012533995506680856,
"grad_norm": 0.29378530383110046,
"learning_rate": 5.876974852148312e-05,
"loss": 0.455,
"num_tokens": 67163008.0,
"step": 106
},
{
"epoch": 0.012652240747309921,
"grad_norm": 0.32384178042411804,
"learning_rate": 5.873865211008652e-05,
"loss": 0.45,
"num_tokens": 67799173.0,
"step": 107
},
{
"epoch": 0.012770485987938986,
"grad_norm": 0.3031487762928009,
"learning_rate": 5.870717697958928e-05,
"loss": 0.431,
"num_tokens": 68433626.0,
"step": 108
},
{
"epoch": 0.01288873122856805,
"grad_norm": 0.3422238230705261,
"learning_rate": 5.867532359311718e-05,
"loss": 0.462,
"num_tokens": 69071597.0,
"step": 109
},
{
"epoch": 0.013006976469197115,
"grad_norm": 0.36208781599998474,
"learning_rate": 5.864309241936167e-05,
"loss": 0.4841,
"num_tokens": 69708272.0,
"step": 110
},
{
"epoch": 0.01312522170982618,
"grad_norm": 0.35731053352355957,
"learning_rate": 5.861048393257293e-05,
"loss": 0.4707,
"num_tokens": 70309426.0,
"step": 111
},
{
"epoch": 0.013243466950455245,
"grad_norm": 0.42830735445022583,
"learning_rate": 5.8577498612552985e-05,
"loss": 0.4905,
"num_tokens": 70946347.0,
"step": 112
},
{
"epoch": 0.01336171219108431,
"grad_norm": 0.33078286051750183,
"learning_rate": 5.8544136944648554e-05,
"loss": 0.4294,
"num_tokens": 71578069.0,
"step": 113
},
{
"epoch": 0.013479957431713374,
"grad_norm": 0.31700757145881653,
"learning_rate": 5.851039941974397e-05,
"loss": 0.4321,
"num_tokens": 72216733.0,
"step": 114
},
{
"epoch": 0.013598202672342439,
"grad_norm": 0.3752131462097168,
"learning_rate": 5.8476286534253925e-05,
"loss": 0.4585,
"num_tokens": 72844928.0,
"step": 115
},
{
"epoch": 0.013716447912971504,
"grad_norm": 0.29715201258659363,
"learning_rate": 5.844179879011618e-05,
"loss": 0.4574,
"num_tokens": 73482837.0,
"step": 116
},
{
"epoch": 0.013834693153600568,
"grad_norm": 0.30245885252952576,
"learning_rate": 5.8406936694784165e-05,
"loss": 0.4828,
"num_tokens": 74118196.0,
"step": 117
},
{
"epoch": 0.013952938394229633,
"grad_norm": 0.29638686776161194,
"learning_rate": 5.8371700761219527e-05,
"loss": 0.4263,
"num_tokens": 74756174.0,
"step": 118
},
{
"epoch": 0.014071183634858696,
"grad_norm": 0.3421514928340912,
"learning_rate": 5.833609150788458e-05,
"loss": 0.4882,
"num_tokens": 75393367.0,
"step": 119
},
{
"epoch": 0.014189428875487761,
"grad_norm": 0.3114563226699829,
"learning_rate": 5.830010945873467e-05,
"loss": 0.4346,
"num_tokens": 76025875.0,
"step": 120
},
{
"epoch": 0.014307674116116826,
"grad_norm": 0.29460081458091736,
"learning_rate": 5.826375514321047e-05,
"loss": 0.4155,
"num_tokens": 76657710.0,
"step": 121
},
{
"epoch": 0.01442591935674589,
"grad_norm": 0.34313178062438965,
"learning_rate": 5.8227029096230196e-05,
"loss": 0.4563,
"num_tokens": 77289318.0,
"step": 122
},
{
"epoch": 0.014544164597374955,
"grad_norm": 0.3677009046077728,
"learning_rate": 5.81899318581817e-05,
"loss": 0.4417,
"num_tokens": 77923482.0,
"step": 123
},
{
"epoch": 0.01466240983800402,
"grad_norm": 0.3275640606880188,
"learning_rate": 5.8152463974914595e-05,
"loss": 0.4607,
"num_tokens": 78551959.0,
"step": 124
},
{
"epoch": 0.014780655078633085,
"grad_norm": 0.37022823095321655,
"learning_rate": 5.811462599773214e-05,
"loss": 0.4506,
"num_tokens": 79181459.0,
"step": 125
},
{
"epoch": 0.01489890031926215,
"grad_norm": 0.31386008858680725,
"learning_rate": 5.807641848338316e-05,
"loss": 0.4194,
"num_tokens": 79816398.0,
"step": 126
},
{
"epoch": 0.015017145559891214,
"grad_norm": 0.3229714632034302,
"learning_rate": 5.80378419940539e-05,
"loss": 0.4543,
"num_tokens": 80451198.0,
"step": 127
},
{
"epoch": 0.015135390800520279,
"grad_norm": 0.33021923899650574,
"learning_rate": 5.799889709735966e-05,
"loss": 0.4601,
"num_tokens": 81087693.0,
"step": 128
},
{
"epoch": 0.015253636041149344,
"grad_norm": 0.2887071371078491,
"learning_rate": 5.7959584366336535e-05,
"loss": 0.4132,
"num_tokens": 81722590.0,
"step": 129
},
{
"epoch": 0.015371881281778408,
"grad_norm": 0.36038845777511597,
"learning_rate": 5.7919904379432913e-05,
"loss": 0.5152,
"num_tokens": 82358511.0,
"step": 130
},
{
"epoch": 0.015490126522407473,
"grad_norm": 0.32132768630981445,
"learning_rate": 5.787985772050101e-05,
"loss": 0.4346,
"num_tokens": 82997292.0,
"step": 131
},
{
"epoch": 0.015608371763036538,
"grad_norm": 0.2985667884349823,
"learning_rate": 5.783944497878826e-05,
"loss": 0.4244,
"num_tokens": 83636002.0,
"step": 132
},
{
"epoch": 0.0157266170036656,
"grad_norm": 0.30603644251823425,
"learning_rate": 5.7798666748928636e-05,
"loss": 0.4487,
"num_tokens": 84266256.0,
"step": 133
},
{
"epoch": 0.015844862244294666,
"grad_norm": 0.3081704378128052,
"learning_rate": 5.775752363093394e-05,
"loss": 0.4649,
"num_tokens": 84901903.0,
"step": 134
},
{
"epoch": 0.01596310748492373,
"grad_norm": 0.29722145199775696,
"learning_rate": 5.7716016230184895e-05,
"loss": 0.4351,
"num_tokens": 85532297.0,
"step": 135
},
{
"epoch": 0.016081352725552795,
"grad_norm": 0.27900344133377075,
"learning_rate": 5.767414515742235e-05,
"loss": 0.3898,
"num_tokens": 86159004.0,
"step": 136
},
{
"epoch": 0.01619959796618186,
"grad_norm": 0.2939743995666504,
"learning_rate": 5.7631911028738184e-05,
"loss": 0.4395,
"num_tokens": 86791668.0,
"step": 137
},
{
"epoch": 0.016317843206810925,
"grad_norm": 0.3190593421459198,
"learning_rate": 5.7589314465566326e-05,
"loss": 0.4502,
"num_tokens": 87415500.0,
"step": 138
},
{
"epoch": 0.01643608844743999,
"grad_norm": 0.29683569073677063,
"learning_rate": 5.7546356094673545e-05,
"loss": 0.4181,
"num_tokens": 88054250.0,
"step": 139
},
{
"epoch": 0.016554333688069054,
"grad_norm": 0.295808345079422,
"learning_rate": 5.750303654815026e-05,
"loss": 0.4011,
"num_tokens": 88683640.0,
"step": 140
},
{
"epoch": 0.01667257892869812,
"grad_norm": 0.33235254883766174,
"learning_rate": 5.745935646340125e-05,
"loss": 0.4017,
"num_tokens": 89322994.0,
"step": 141
},
{
"epoch": 0.016790824169327184,
"grad_norm": 0.30324289202690125,
"learning_rate": 5.7415316483136266e-05,
"loss": 0.4486,
"num_tokens": 89959870.0,
"step": 142
},
{
"epoch": 0.01690906940995625,
"grad_norm": 0.3970086872577667,
"learning_rate": 5.737091725536055e-05,
"loss": 0.4515,
"num_tokens": 90595155.0,
"step": 143
},
{
"epoch": 0.017027314650585313,
"grad_norm": 0.27713295817375183,
"learning_rate": 5.732615943336531e-05,
"loss": 0.4523,
"num_tokens": 91229434.0,
"step": 144
},
{
"epoch": 0.017145559891214378,
"grad_norm": 0.31949537992477417,
"learning_rate": 5.7281043675718176e-05,
"loss": 0.423,
"num_tokens": 91864729.0,
"step": 145
},
{
"epoch": 0.017263805131843443,
"grad_norm": 0.2788122892379761,
"learning_rate": 5.7235570646253385e-05,
"loss": 0.4037,
"num_tokens": 92497696.0,
"step": 146
},
{
"epoch": 0.017382050372472507,
"grad_norm": 0.33565449714660645,
"learning_rate": 5.71897410140621e-05,
"loss": 0.4794,
"num_tokens": 93136961.0,
"step": 147
},
{
"epoch": 0.017500295613101572,
"grad_norm": 0.3093065619468689,
"learning_rate": 5.7143555453482564e-05,
"loss": 0.46,
"num_tokens": 93763389.0,
"step": 148
},
{
"epoch": 0.017618540853730637,
"grad_norm": 0.28062355518341064,
"learning_rate": 5.709701464409014e-05,
"loss": 0.4594,
"num_tokens": 94396681.0,
"step": 149
},
{
"epoch": 0.0177367860943597,
"grad_norm": 0.29357820749282837,
"learning_rate": 5.705011927068734e-05,
"loss": 0.4611,
"num_tokens": 95024975.0,
"step": 150
},
{
"epoch": 0.017855031334988766,
"grad_norm": 0.37621134519577026,
"learning_rate": 5.700287002329374e-05,
"loss": 0.4681,
"num_tokens": 95647926.0,
"step": 151
},
{
"epoch": 0.01797327657561783,
"grad_norm": 0.3109932541847229,
"learning_rate": 5.6955267597135795e-05,
"loss": 0.4347,
"num_tokens": 96284873.0,
"step": 152
},
{
"epoch": 0.018091521816246896,
"grad_norm": 0.33683377504348755,
"learning_rate": 5.6907312692636665e-05,
"loss": 0.4484,
"num_tokens": 96921347.0,
"step": 153
},
{
"epoch": 0.01820976705687596,
"grad_norm": 0.29445359110832214,
"learning_rate": 5.6859006015405905e-05,
"loss": 0.3997,
"num_tokens": 97555490.0,
"step": 154
},
{
"epoch": 0.018328012297505025,
"grad_norm": 0.32711490988731384,
"learning_rate": 5.681034827622904e-05,
"loss": 0.4153,
"num_tokens": 98193055.0,
"step": 155
},
{
"epoch": 0.01844625753813409,
"grad_norm": 0.29570791125297546,
"learning_rate": 5.67613401910571e-05,
"loss": 0.3944,
"num_tokens": 98826897.0,
"step": 156
},
{
"epoch": 0.018564502778763155,
"grad_norm": 0.3205905854701996,
"learning_rate": 5.671198248099617e-05,
"loss": 0.4673,
"num_tokens": 99462013.0,
"step": 157
},
{
"epoch": 0.01868274801939222,
"grad_norm": 0.29417866468429565,
"learning_rate": 5.666227587229669e-05,
"loss": 0.4771,
"num_tokens": 100097628.0,
"step": 158
},
{
"epoch": 0.018800993260021284,
"grad_norm": 0.2989625036716461,
"learning_rate": 5.66122210963428e-05,
"loss": 0.4152,
"num_tokens": 100734556.0,
"step": 159
},
{
"epoch": 0.01891923850065035,
"grad_norm": 0.3053020238876343,
"learning_rate": 5.656181888964159e-05,
"loss": 0.4606,
"num_tokens": 101371427.0,
"step": 160
},
{
"epoch": 0.019037483741279414,
"grad_norm": 0.2914108633995056,
"learning_rate": 5.6511069993812255e-05,
"loss": 0.4647,
"num_tokens": 102008014.0,
"step": 161
},
{
"epoch": 0.01915572898190848,
"grad_norm": 0.31419283151626587,
"learning_rate": 5.645997515557518e-05,
"loss": 0.4277,
"num_tokens": 102647195.0,
"step": 162
},
{
"epoch": 0.019273974222537543,
"grad_norm": 0.25925683975219727,
"learning_rate": 5.640853512674095e-05,
"loss": 0.4409,
"num_tokens": 103272117.0,
"step": 163
},
{
"epoch": 0.019392219463166608,
"grad_norm": 0.29054054617881775,
"learning_rate": 5.63567506641993e-05,
"loss": 0.4468,
"num_tokens": 103911617.0,
"step": 164
},
{
"epoch": 0.019510464703795673,
"grad_norm": 0.2996600270271301,
"learning_rate": 5.630462252990796e-05,
"loss": 0.4583,
"num_tokens": 104546025.0,
"step": 165
},
{
"epoch": 0.019628709944424737,
"grad_norm": 0.26758819818496704,
"learning_rate": 5.6252151490881474e-05,
"loss": 0.4193,
"num_tokens": 105181492.0,
"step": 166
},
{
"epoch": 0.019746955185053802,
"grad_norm": 0.28083500266075134,
"learning_rate": 5.6199338319179856e-05,
"loss": 0.4166,
"num_tokens": 105818707.0,
"step": 167
},
{
"epoch": 0.019865200425682867,
"grad_norm": 0.2543669641017914,
"learning_rate": 5.614618379189731e-05,
"loss": 0.3928,
"num_tokens": 106447672.0,
"step": 168
},
{
"epoch": 0.01998344566631193,
"grad_norm": 0.29574477672576904,
"learning_rate": 5.609268869115072e-05,
"loss": 0.4303,
"num_tokens": 107079280.0,
"step": 169
},
{
"epoch": 0.020101690906940996,
"grad_norm": 0.2757669985294342,
"learning_rate": 5.6038853804068205e-05,
"loss": 0.4325,
"num_tokens": 107716692.0,
"step": 170
},
{
"epoch": 0.02021993614757006,
"grad_norm": 0.3341258764266968,
"learning_rate": 5.598467992277748e-05,
"loss": 0.4302,
"num_tokens": 108346190.0,
"step": 171
},
{
"epoch": 0.020338181388199126,
"grad_norm": 0.2687680423259735,
"learning_rate": 5.5930167844394255e-05,
"loss": 0.4188,
"num_tokens": 108972655.0,
"step": 172
},
{
"epoch": 0.02045642662882819,
"grad_norm": 0.3229896128177643,
"learning_rate": 5.587531837101046e-05,
"loss": 0.4436,
"num_tokens": 109606533.0,
"step": 173
},
{
"epoch": 0.020574671869457255,
"grad_norm": 0.2820740044116974,
"learning_rate": 5.582013230968246e-05,
"loss": 0.4294,
"num_tokens": 110242667.0,
"step": 174
},
{
"epoch": 0.02069291711008632,
"grad_norm": 0.35922062397003174,
"learning_rate": 5.5764610472419194e-05,
"loss": 0.4835,
"num_tokens": 110879342.0,
"step": 175
},
{
"epoch": 0.020811162350715385,
"grad_norm": 0.2997070550918579,
"learning_rate": 5.5708753676170236e-05,
"loss": 0.4347,
"num_tokens": 111515578.0,
"step": 176
},
{
"epoch": 0.02092940759134445,
"grad_norm": 0.2995700240135193,
"learning_rate": 5.565256274281369e-05,
"loss": 0.395,
"num_tokens": 112148074.0,
"step": 177
},
{
"epoch": 0.021047652831973514,
"grad_norm": 0.319938600063324,
"learning_rate": 5.5596038499144235e-05,
"loss": 0.4813,
"num_tokens": 112784825.0,
"step": 178
},
{
"epoch": 0.02116589807260258,
"grad_norm": 0.338448166847229,
"learning_rate": 5.5539181776860835e-05,
"loss": 0.457,
"num_tokens": 113415511.0,
"step": 179
},
{
"epoch": 0.021284143313231644,
"grad_norm": 0.318758487701416,
"learning_rate": 5.548199341255457e-05,
"loss": 0.4566,
"num_tokens": 114014233.0,
"step": 180
},
{
"epoch": 0.02140238855386071,
"grad_norm": 0.322611927986145,
"learning_rate": 5.542447424769632e-05,
"loss": 0.4384,
"num_tokens": 114646091.0,
"step": 181
},
{
"epoch": 0.021520633794489773,
"grad_norm": 0.3043844699859619,
"learning_rate": 5.536662512862434e-05,
"loss": 0.4125,
"num_tokens": 115248849.0,
"step": 182
},
{
"epoch": 0.021638879035118838,
"grad_norm": 0.30535179376602173,
"learning_rate": 5.530844690653187e-05,
"loss": 0.4083,
"num_tokens": 115882858.0,
"step": 183
},
{
"epoch": 0.0217571242757479,
"grad_norm": 0.29622629284858704,
"learning_rate": 5.524994043745455e-05,
"loss": 0.4424,
"num_tokens": 116516321.0,
"step": 184
},
{
"epoch": 0.021875369516376964,
"grad_norm": 0.3178810775279999,
"learning_rate": 5.519110658225789e-05,
"loss": 0.4187,
"num_tokens": 117149980.0,
"step": 185
},
{
"epoch": 0.02199361475700603,
"grad_norm": 0.2797812819480896,
"learning_rate": 5.513194620662453e-05,
"loss": 0.4033,
"num_tokens": 117787055.0,
"step": 186
},
{
"epoch": 0.022111859997635094,
"grad_norm": 0.34582096338272095,
"learning_rate": 5.5072460181041565e-05,
"loss": 0.4264,
"num_tokens": 118414231.0,
"step": 187
},
{
"epoch": 0.02223010523826416,
"grad_norm": 0.3072027266025543,
"learning_rate": 5.5012649380787697e-05,
"loss": 0.4425,
"num_tokens": 119042723.0,
"step": 188
},
{
"epoch": 0.022348350478893223,
"grad_norm": 0.27631890773773193,
"learning_rate": 5.495251468592038e-05,
"loss": 0.47,
"num_tokens": 119680244.0,
"step": 189
},
{
"epoch": 0.022466595719522288,
"grad_norm": 0.39626777172088623,
"learning_rate": 5.489205698126284e-05,
"loss": 0.4255,
"num_tokens": 120319137.0,
"step": 190
},
{
"epoch": 0.022584840960151353,
"grad_norm": 0.28610390424728394,
"learning_rate": 5.483127715639111e-05,
"loss": 0.4364,
"num_tokens": 120954282.0,
"step": 191
},
{
"epoch": 0.022703086200780417,
"grad_norm": 0.4135710597038269,
"learning_rate": 5.477017610562086e-05,
"loss": 0.4342,
"num_tokens": 121589180.0,
"step": 192
},
{
"epoch": 0.022821331441409482,
"grad_norm": 0.3279666304588318,
"learning_rate": 5.4708754727994347e-05,
"loss": 0.4693,
"num_tokens": 122226045.0,
"step": 193
},
{
"epoch": 0.022939576682038547,
"grad_norm": 0.3193162679672241,
"learning_rate": 5.4647013927267055e-05,
"loss": 0.411,
"num_tokens": 122863565.0,
"step": 194
},
{
"epoch": 0.02305782192266761,
"grad_norm": 0.3436163067817688,
"learning_rate": 5.4584954611894535e-05,
"loss": 0.4065,
"num_tokens": 123498631.0,
"step": 195
},
{
"epoch": 0.023176067163296676,
"grad_norm": 0.3175835907459259,
"learning_rate": 5.452257769501891e-05,
"loss": 0.4343,
"num_tokens": 124134670.0,
"step": 196
},
{
"epoch": 0.02329431240392574,
"grad_norm": 0.276996374130249,
"learning_rate": 5.445988409445553e-05,
"loss": 0.4125,
"num_tokens": 124770499.0,
"step": 197
},
{
"epoch": 0.023412557644554806,
"grad_norm": 0.3553844690322876,
"learning_rate": 5.4396874732679444e-05,
"loss": 0.4659,
"num_tokens": 125409234.0,
"step": 198
},
{
"epoch": 0.02353080288518387,
"grad_norm": 0.26136353611946106,
"learning_rate": 5.433355053681179e-05,
"loss": 0.4354,
"num_tokens": 126041885.0,
"step": 199
},
{
"epoch": 0.023649048125812935,
"grad_norm": 0.3091793656349182,
"learning_rate": 5.42699124386062e-05,
"loss": 0.4539,
"num_tokens": 126679673.0,
"step": 200
},
{
"epoch": 0.023767293366442,
"grad_norm": 0.3038508892059326,
"learning_rate": 5.420596137443508e-05,
"loss": 0.4468,
"num_tokens": 127318553.0,
"step": 201
},
{
"epoch": 0.023885538607071065,
"grad_norm": 0.257994145154953,
"learning_rate": 5.41416982852758e-05,
"loss": 0.4177,
"num_tokens": 127957565.0,
"step": 202
},
{
"epoch": 0.02400378384770013,
"grad_norm": 0.3154793381690979,
"learning_rate": 5.4077124116696884e-05,
"loss": 0.4944,
"num_tokens": 128588826.0,
"step": 203
},
{
"epoch": 0.024122029088329194,
"grad_norm": 0.30118247866630554,
"learning_rate": 5.401223981884411e-05,
"loss": 0.4431,
"num_tokens": 129222173.0,
"step": 204
},
{
"epoch": 0.02424027432895826,
"grad_norm": 0.26696497201919556,
"learning_rate": 5.3947046346426456e-05,
"loss": 0.4586,
"num_tokens": 129857385.0,
"step": 205
},
{
"epoch": 0.024358519569587324,
"grad_norm": 0.25432252883911133,
"learning_rate": 5.3881544658702133e-05,
"loss": 0.3814,
"num_tokens": 130486516.0,
"step": 206
},
{
"epoch": 0.02447676481021639,
"grad_norm": 0.27828487753868103,
"learning_rate": 5.381573571946445e-05,
"loss": 0.4529,
"num_tokens": 131117306.0,
"step": 207
},
{
"epoch": 0.024595010050845453,
"grad_norm": 0.29483503103256226,
"learning_rate": 5.374962049702759e-05,
"loss": 0.4738,
"num_tokens": 131749433.0,
"step": 208
},
{
"epoch": 0.024713255291474518,
"grad_norm": 0.2637292742729187,
"learning_rate": 5.3683199964212405e-05,
"loss": 0.4242,
"num_tokens": 132382579.0,
"step": 209
},
{
"epoch": 0.024831500532103583,
"grad_norm": 0.2828076183795929,
"learning_rate": 5.3616475098332105e-05,
"loss": 0.4374,
"num_tokens": 133017061.0,
"step": 210
},
{
"epoch": 0.024949745772732648,
"grad_norm": 0.27759385108947754,
"learning_rate": 5.3549446881177853e-05,
"loss": 0.4296,
"num_tokens": 133645920.0,
"step": 211
},
{
"epoch": 0.025067991013361712,
"grad_norm": 0.26630890369415283,
"learning_rate": 5.3482116299004336e-05,
"loss": 0.468,
"num_tokens": 134277976.0,
"step": 212
},
{
"epoch": 0.025186236253990777,
"grad_norm": 0.24754807353019714,
"learning_rate": 5.341448434251522e-05,
"loss": 0.4468,
"num_tokens": 134913386.0,
"step": 213
},
{
"epoch": 0.025304481494619842,
"grad_norm": 0.27732178568840027,
"learning_rate": 5.334655200684864e-05,
"loss": 0.4323,
"num_tokens": 135544399.0,
"step": 214
},
{
"epoch": 0.025422726735248907,
"grad_norm": 0.30716535449028015,
"learning_rate": 5.327832029156247e-05,
"loss": 0.441,
"num_tokens": 136182707.0,
"step": 215
},
{
"epoch": 0.02554097197587797,
"grad_norm": 0.26287323236465454,
"learning_rate": 5.3209790200619726e-05,
"loss": 0.436,
"num_tokens": 136819793.0,
"step": 216
},
{
"epoch": 0.025659217216507036,
"grad_norm": 0.28410691022872925,
"learning_rate": 5.314096274237367e-05,
"loss": 0.4414,
"num_tokens": 137459203.0,
"step": 217
},
{
"epoch": 0.0257774624571361,
"grad_norm": 0.27251100540161133,
"learning_rate": 5.3071838929553065e-05,
"loss": 0.4345,
"num_tokens": 138086108.0,
"step": 218
},
{
"epoch": 0.025895707697765166,
"grad_norm": 0.24234391748905182,
"learning_rate": 5.300241977924722e-05,
"loss": 0.4244,
"num_tokens": 138717361.0,
"step": 219
},
{
"epoch": 0.02601395293839423,
"grad_norm": 0.31852856278419495,
"learning_rate": 5.293270631289107e-05,
"loss": 0.408,
"num_tokens": 139353768.0,
"step": 220
},
{
"epoch": 0.026132198179023295,
"grad_norm": 0.29865893721580505,
"learning_rate": 5.286269955625011e-05,
"loss": 0.4701,
"num_tokens": 139986012.0,
"step": 221
},
{
"epoch": 0.02625044341965236,
"grad_norm": 0.27321770787239075,
"learning_rate": 5.279240053940531e-05,
"loss": 0.4059,
"num_tokens": 140618557.0,
"step": 222
},
{
"epoch": 0.026368688660281425,
"grad_norm": 0.29831984639167786,
"learning_rate": 5.2721810296737984e-05,
"loss": 0.3978,
"num_tokens": 141253328.0,
"step": 223
},
{
"epoch": 0.02648693390091049,
"grad_norm": 0.3366415798664093,
"learning_rate": 5.265092986691453e-05,
"loss": 0.4354,
"num_tokens": 141885327.0,
"step": 224
},
{
"epoch": 0.026605179141539554,
"grad_norm": 0.27974531054496765,
"learning_rate": 5.257976029287117e-05,
"loss": 0.4497,
"num_tokens": 142518760.0,
"step": 225
},
{
"epoch": 0.02672342438216862,
"grad_norm": 0.31790000200271606,
"learning_rate": 5.250830262179859e-05,
"loss": 0.4561,
"num_tokens": 143154500.0,
"step": 226
},
{
"epoch": 0.026841669622797684,
"grad_norm": 0.2560494840145111,
"learning_rate": 5.243655790512659e-05,
"loss": 0.4402,
"num_tokens": 143792062.0,
"step": 227
},
{
"epoch": 0.026959914863426748,
"grad_norm": 0.27620622515678406,
"learning_rate": 5.236452719850849e-05,
"loss": 0.3912,
"num_tokens": 144424810.0,
"step": 228
},
{
"epoch": 0.027078160104055813,
"grad_norm": 0.2581166625022888,
"learning_rate": 5.2292211561805726e-05,
"loss": 0.4145,
"num_tokens": 145021445.0,
"step": 229
},
{
"epoch": 0.027196405344684878,
"grad_norm": 0.297852486371994,
"learning_rate": 5.2219612059072196e-05,
"loss": 0.4675,
"num_tokens": 145656556.0,
"step": 230
},
{
"epoch": 0.027314650585313942,
"grad_norm": 0.286258339881897,
"learning_rate": 5.214672975853859e-05,
"loss": 0.413,
"num_tokens": 146293020.0,
"step": 231
},
{
"epoch": 0.027432895825943007,
"grad_norm": 0.2543971538543701,
"learning_rate": 5.207356573259671e-05,
"loss": 0.4335,
"num_tokens": 146922200.0,
"step": 232
},
{
"epoch": 0.027551141066572072,
"grad_norm": 0.29354169964790344,
"learning_rate": 5.2000121057783674e-05,
"loss": 0.4786,
"num_tokens": 147560483.0,
"step": 233
},
{
"epoch": 0.027669386307201137,
"grad_norm": 0.22866986691951752,
"learning_rate": 5.1926396814766034e-05,
"loss": 0.4198,
"num_tokens": 148198475.0,
"step": 234
},
{
"epoch": 0.0277876315478302,
"grad_norm": 0.2605131268501282,
"learning_rate": 5.185239408832397e-05,
"loss": 0.4363,
"num_tokens": 148811827.0,
"step": 235
},
{
"epoch": 0.027905876788459266,
"grad_norm": 0.22731252014636993,
"learning_rate": 5.177811396733523e-05,
"loss": 0.4034,
"num_tokens": 149446588.0,
"step": 236
},
{
"epoch": 0.028024122029088328,
"grad_norm": 0.23291230201721191,
"learning_rate": 5.170355754475919e-05,
"loss": 0.3862,
"num_tokens": 150080880.0,
"step": 237
},
{
"epoch": 0.028142367269717392,
"grad_norm": 0.2324601113796234,
"learning_rate": 5.162872591762069e-05,
"loss": 0.4557,
"num_tokens": 150720517.0,
"step": 238
},
{
"epoch": 0.028260612510346457,
"grad_norm": 0.2646247148513794,
"learning_rate": 5.155362018699396e-05,
"loss": 0.4241,
"num_tokens": 151354865.0,
"step": 239
},
{
"epoch": 0.028378857750975522,
"grad_norm": 0.2472905069589615,
"learning_rate": 5.147824145798643e-05,
"loss": 0.3896,
"num_tokens": 151989302.0,
"step": 240
},
{
"epoch": 0.028497102991604586,
"grad_norm": 0.27559757232666016,
"learning_rate": 5.1402590839722356e-05,
"loss": 0.4254,
"num_tokens": 152622644.0,
"step": 241
},
{
"epoch": 0.02861534823223365,
"grad_norm": 0.2577532231807709,
"learning_rate": 5.132666944532664e-05,
"loss": 0.4598,
"num_tokens": 153254978.0,
"step": 242
},
{
"epoch": 0.028733593472862716,
"grad_norm": 0.26456958055496216,
"learning_rate": 5.125047839190837e-05,
"loss": 0.4006,
"num_tokens": 153888439.0,
"step": 243
},
{
"epoch": 0.02885183871349178,
"grad_norm": 0.23455888032913208,
"learning_rate": 5.1174018800544395e-05,
"loss": 0.377,
"num_tokens": 154521234.0,
"step": 244
},
{
"epoch": 0.028970083954120845,
"grad_norm": 0.2502966821193695,
"learning_rate": 5.1097291796262854e-05,
"loss": 0.4257,
"num_tokens": 155156546.0,
"step": 245
},
{
"epoch": 0.02908832919474991,
"grad_norm": 0.26023155450820923,
"learning_rate": 5.10202985080266e-05,
"loss": 0.4268,
"num_tokens": 155791974.0,
"step": 246
},
{
"epoch": 0.029206574435378975,
"grad_norm": 0.23456987738609314,
"learning_rate": 5.0943040068716584e-05,
"loss": 0.385,
"num_tokens": 156417737.0,
"step": 247
},
{
"epoch": 0.02932481967600804,
"grad_norm": 0.23901493847370148,
"learning_rate": 5.086551761511521e-05,
"loss": 0.4553,
"num_tokens": 157054324.0,
"step": 248
},
{
"epoch": 0.029443064916637104,
"grad_norm": 0.26856529712677,
"learning_rate": 5.0787732287889574e-05,
"loss": 0.4435,
"num_tokens": 157686875.0,
"step": 249
},
{
"epoch": 0.02956131015726617,
"grad_norm": 0.28496497869491577,
"learning_rate": 5.070968523157474e-05,
"loss": 0.4373,
"num_tokens": 158318798.0,
"step": 250
},
{
"epoch": 0.029679555397895234,
"grad_norm": 0.2572629451751709,
"learning_rate": 5.0631377594556795e-05,
"loss": 0.451,
"num_tokens": 158956587.0,
"step": 251
},
{
"epoch": 0.0297978006385243,
"grad_norm": 0.25811442732810974,
"learning_rate": 5.05528105290561e-05,
"loss": 0.3716,
"num_tokens": 159591859.0,
"step": 252
},
{
"epoch": 0.029916045879153363,
"grad_norm": 0.266215980052948,
"learning_rate": 5.047398519111017e-05,
"loss": 0.4106,
"num_tokens": 160224798.0,
"step": 253
},
{
"epoch": 0.030034291119782428,
"grad_norm": 0.2669126093387604,
"learning_rate": 5.0394902740556806e-05,
"loss": 0.4158,
"num_tokens": 160855622.0,
"step": 254
},
{
"epoch": 0.030152536360411493,
"grad_norm": 0.27752405405044556,
"learning_rate": 5.031556434101694e-05,
"loss": 0.3848,
"num_tokens": 161489536.0,
"step": 255
},
{
"epoch": 0.030270781601040558,
"grad_norm": 0.2833244502544403,
"learning_rate": 5.023597115987755e-05,
"loss": 0.4691,
"num_tokens": 162123541.0,
"step": 256
},
{
"epoch": 0.030389026841669622,
"grad_norm": 0.23394179344177246,
"learning_rate": 5.0156124368274474e-05,
"loss": 0.4263,
"num_tokens": 162754194.0,
"step": 257
},
{
"epoch": 0.030507272082298687,
"grad_norm": 0.2544839680194855,
"learning_rate": 5.007602514107518e-05,
"loss": 0.4125,
"num_tokens": 163389806.0,
"step": 258
},
{
"epoch": 0.030625517322927752,
"grad_norm": 0.23980510234832764,
"learning_rate": 4.99956746568615e-05,
"loss": 0.4008,
"num_tokens": 164022489.0,
"step": 259
},
{
"epoch": 0.030743762563556817,
"grad_norm": 0.2352251559495926,
"learning_rate": 4.991507409791223e-05,
"loss": 0.4105,
"num_tokens": 164655264.0,
"step": 260
},
{
"epoch": 0.03086200780418588,
"grad_norm": 0.23630301654338837,
"learning_rate": 4.983422465018581e-05,
"loss": 0.4021,
"num_tokens": 165293663.0,
"step": 261
},
{
"epoch": 0.030980253044814946,
"grad_norm": 0.25700318813323975,
"learning_rate": 4.975312750330279e-05,
"loss": 0.4257,
"num_tokens": 165930344.0,
"step": 262
},
{
"epoch": 0.03109849828544401,
"grad_norm": 0.22052328288555145,
"learning_rate": 4.967178385052841e-05,
"loss": 0.3837,
"num_tokens": 166565332.0,
"step": 263
},
{
"epoch": 0.031216743526073076,
"grad_norm": 0.25492745637893677,
"learning_rate": 4.959019488875499e-05,
"loss": 0.4271,
"num_tokens": 167200224.0,
"step": 264
},
{
"epoch": 0.03133498876670214,
"grad_norm": 0.23811548948287964,
"learning_rate": 4.9508361818484334e-05,
"loss": 0.4191,
"num_tokens": 167832144.0,
"step": 265
},
{
"epoch": 0.0314532340073312,
"grad_norm": 0.22468101978302002,
"learning_rate": 4.9426285843810045e-05,
"loss": 0.3999,
"num_tokens": 168469994.0,
"step": 266
},
{
"epoch": 0.03157147924796027,
"grad_norm": 0.2614614963531494,
"learning_rate": 4.934396817239986e-05,
"loss": 0.4342,
"num_tokens": 169106098.0,
"step": 267
},
{
"epoch": 0.03168972448858933,
"grad_norm": 0.22566261887550354,
"learning_rate": 4.926141001547783e-05,
"loss": 0.3746,
"num_tokens": 169737993.0,
"step": 268
},
{
"epoch": 0.0318079697292184,
"grad_norm": 0.2282998412847519,
"learning_rate": 4.91786125878065e-05,
"loss": 0.3858,
"num_tokens": 170368812.0,
"step": 269
},
{
"epoch": 0.03192621496984746,
"grad_norm": 0.25831902027130127,
"learning_rate": 4.9095577107669084e-05,
"loss": 0.4595,
"num_tokens": 171006424.0,
"step": 270
},
{
"epoch": 0.03204446021047653,
"grad_norm": 0.22491995990276337,
"learning_rate": 4.9012304796851486e-05,
"loss": 0.4136,
"num_tokens": 171645721.0,
"step": 271
},
{
"epoch": 0.03216270545110559,
"grad_norm": 0.25414589047431946,
"learning_rate": 4.892879688062432e-05,
"loss": 0.4001,
"num_tokens": 172281075.0,
"step": 272
},
{
"epoch": 0.03228095069173466,
"grad_norm": 0.22207149863243103,
"learning_rate": 4.884505458772495e-05,
"loss": 0.3639,
"num_tokens": 172914063.0,
"step": 273
},
{
"epoch": 0.03239919593236372,
"grad_norm": 0.23464854061603546,
"learning_rate": 4.876107915033933e-05,
"loss": 0.4264,
"num_tokens": 173548325.0,
"step": 274
},
{
"epoch": 0.03251744117299279,
"grad_norm": 0.26920729875564575,
"learning_rate": 4.867687180408392e-05,
"loss": 0.4248,
"num_tokens": 174183975.0,
"step": 275
},
{
"epoch": 0.03263568641362185,
"grad_norm": 0.22815345227718353,
"learning_rate": 4.859243378798748e-05,
"loss": 0.398,
"num_tokens": 174818549.0,
"step": 276
},
{
"epoch": 0.03275393165425092,
"grad_norm": 0.232111856341362,
"learning_rate": 4.850776634447287e-05,
"loss": 0.3862,
"num_tokens": 175451113.0,
"step": 277
},
{
"epoch": 0.03287217689487998,
"grad_norm": 0.27156439423561096,
"learning_rate": 4.842287071933874e-05,
"loss": 0.433,
"num_tokens": 176087116.0,
"step": 278
},
{
"epoch": 0.03299042213550905,
"grad_norm": 0.2743763029575348,
"learning_rate": 4.8337748161741207e-05,
"loss": 0.4497,
"num_tokens": 176724483.0,
"step": 279
},
{
"epoch": 0.03310866737613811,
"grad_norm": 0.26658013463020325,
"learning_rate": 4.825239992417548e-05,
"loss": 0.4255,
"num_tokens": 177361164.0,
"step": 280
},
{
"epoch": 0.033226912616767176,
"grad_norm": 0.2353833168745041,
"learning_rate": 4.8166827262457436e-05,
"loss": 0.3786,
"num_tokens": 177999098.0,
"step": 281
},
{
"epoch": 0.03334515785739624,
"grad_norm": 0.26090359687805176,
"learning_rate": 4.808103143570511e-05,
"loss": 0.4224,
"num_tokens": 178627820.0,
"step": 282
},
{
"epoch": 0.033463403098025306,
"grad_norm": 0.23582051694393158,
"learning_rate": 4.7995013706320215e-05,
"loss": 0.4088,
"num_tokens": 179259176.0,
"step": 283
},
{
"epoch": 0.03358164833865437,
"grad_norm": 0.26351359486579895,
"learning_rate": 4.790877533996955e-05,
"loss": 0.4279,
"num_tokens": 179890905.0,
"step": 284
},
{
"epoch": 0.033699893579283435,
"grad_norm": 0.25399163365364075,
"learning_rate": 4.7822317605566335e-05,
"loss": 0.4169,
"num_tokens": 180518445.0,
"step": 285
},
{
"epoch": 0.0338181388199125,
"grad_norm": 0.2980181872844696,
"learning_rate": 4.7735641775251624e-05,
"loss": 0.449,
"num_tokens": 181154667.0,
"step": 286
},
{
"epoch": 0.033936384060541565,
"grad_norm": 0.27028796076774597,
"learning_rate": 4.764874912437551e-05,
"loss": 0.4321,
"num_tokens": 181789184.0,
"step": 287
},
{
"epoch": 0.034054629301170626,
"grad_norm": 0.2491423338651657,
"learning_rate": 4.756164093147838e-05,
"loss": 0.4155,
"num_tokens": 182421462.0,
"step": 288
},
{
"epoch": 0.034172874541799694,
"grad_norm": 0.3009137213230133,
"learning_rate": 4.747431847827214e-05,
"loss": 0.4015,
"num_tokens": 183056216.0,
"step": 289
},
{
"epoch": 0.034291119782428756,
"grad_norm": 0.2448507696390152,
"learning_rate": 4.73867830496213e-05,
"loss": 0.4331,
"num_tokens": 183695586.0,
"step": 290
},
{
"epoch": 0.034409365023057824,
"grad_norm": 0.2819685935974121,
"learning_rate": 4.729903593352412e-05,
"loss": 0.4017,
"num_tokens": 184323170.0,
"step": 291
},
{
"epoch": 0.034527610263686885,
"grad_norm": 0.3014317750930786,
"learning_rate": 4.721107842109362e-05,
"loss": 0.4771,
"num_tokens": 184923402.0,
"step": 292
},
{
"epoch": 0.03464585550431595,
"grad_norm": 0.23289276659488678,
"learning_rate": 4.712291180653859e-05,
"loss": 0.4004,
"num_tokens": 185562179.0,
"step": 293
},
{
"epoch": 0.034764100744945015,
"grad_norm": 0.25156062841415405,
"learning_rate": 4.703453738714457e-05,
"loss": 0.4127,
"num_tokens": 186196488.0,
"step": 294
},
{
"epoch": 0.03488234598557408,
"grad_norm": 0.27437835931777954,
"learning_rate": 4.6945956463254733e-05,
"loss": 0.4458,
"num_tokens": 186795333.0,
"step": 295
},
{
"epoch": 0.035000591226203144,
"grad_norm": 0.2655051052570343,
"learning_rate": 4.6857170338250756e-05,
"loss": 0.3878,
"num_tokens": 187431540.0,
"step": 296
},
{
"epoch": 0.03511883646683221,
"grad_norm": 0.24947364628314972,
"learning_rate": 4.676818031853367e-05,
"loss": 0.4086,
"num_tokens": 188067882.0,
"step": 297
},
{
"epoch": 0.035237081707461274,
"grad_norm": 0.27399611473083496,
"learning_rate": 4.667898771350461e-05,
"loss": 0.4469,
"num_tokens": 188704706.0,
"step": 298
},
{
"epoch": 0.03535532694809034,
"grad_norm": 0.23381806910037994,
"learning_rate": 4.658959383554554e-05,
"loss": 0.3872,
"num_tokens": 189339944.0,
"step": 299
},
{
"epoch": 0.0354735721887194,
"grad_norm": 0.30683842301368713,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.4722,
"num_tokens": 189977157.0,
"step": 300
},
{
"epoch": 0.03559181742934847,
"grad_norm": 0.23590795695781708,
"learning_rate": 4.641020752515366e-05,
"loss": 0.4177,
"num_tokens": 190586411.0,
"step": 301
},
{
"epoch": 0.03571006266997753,
"grad_norm": 0.2523725926876068,
"learning_rate": 4.632021773221499e-05,
"loss": 0.4323,
"num_tokens": 191219345.0,
"step": 302
},
{
"epoch": 0.0358283079106066,
"grad_norm": 0.24050471186637878,
"learning_rate": 4.623003194529583e-05,
"loss": 0.4244,
"num_tokens": 191855183.0,
"step": 303
},
{
"epoch": 0.03594655315123566,
"grad_norm": 0.24300076067447662,
"learning_rate": 4.613965149139185e-05,
"loss": 0.3956,
"num_tokens": 192485493.0,
"step": 304
},
{
"epoch": 0.03606479839186473,
"grad_norm": 0.2315610945224762,
"learning_rate": 4.6049077700363056e-05,
"loss": 0.3896,
"num_tokens": 193122797.0,
"step": 305
},
{
"epoch": 0.03618304363249379,
"grad_norm": 0.25560230016708374,
"learning_rate": 4.595831190491424e-05,
"loss": 0.4167,
"num_tokens": 193759752.0,
"step": 306
},
{
"epoch": 0.03630128887312286,
"grad_norm": 0.25288936495780945,
"learning_rate": 4.586735544057531e-05,
"loss": 0.4087,
"num_tokens": 194394288.0,
"step": 307
},
{
"epoch": 0.03641953411375192,
"grad_norm": 0.2969334125518799,
"learning_rate": 4.5776209645681745e-05,
"loss": 0.4075,
"num_tokens": 195027778.0,
"step": 308
},
{
"epoch": 0.03653777935438099,
"grad_norm": 0.22655892372131348,
"learning_rate": 4.568487586135478e-05,
"loss": 0.3378,
"num_tokens": 195660013.0,
"step": 309
},
{
"epoch": 0.03665602459501005,
"grad_norm": 0.28944021463394165,
"learning_rate": 4.5593355431481754e-05,
"loss": 0.4249,
"num_tokens": 196290478.0,
"step": 310
},
{
"epoch": 0.03677426983563912,
"grad_norm": 0.23864449560642242,
"learning_rate": 4.550164970269633e-05,
"loss": 0.4412,
"num_tokens": 196927060.0,
"step": 311
},
{
"epoch": 0.03689251507626818,
"grad_norm": 0.247343510389328,
"learning_rate": 4.540976002435862e-05,
"loss": 0.4384,
"num_tokens": 197557085.0,
"step": 312
},
{
"epoch": 0.03701076031689725,
"grad_norm": 0.2885189950466156,
"learning_rate": 4.53176877485354e-05,
"loss": 0.4252,
"num_tokens": 198189535.0,
"step": 313
},
{
"epoch": 0.03712900555752631,
"grad_norm": 0.2791072428226471,
"learning_rate": 4.5225434229980215e-05,
"loss": 0.4425,
"num_tokens": 198820737.0,
"step": 314
},
{
"epoch": 0.03724725079815538,
"grad_norm": 0.2613127529621124,
"learning_rate": 4.513300082611336e-05,
"loss": 0.3994,
"num_tokens": 199451792.0,
"step": 315
},
{
"epoch": 0.03736549603878444,
"grad_norm": 0.2581581473350525,
"learning_rate": 4.504038889700201e-05,
"loss": 0.4052,
"num_tokens": 200086012.0,
"step": 316
},
{
"epoch": 0.0374837412794135,
"grad_norm": 0.25737565755844116,
"learning_rate": 4.494759980534017e-05,
"loss": 0.3975,
"num_tokens": 200723155.0,
"step": 317
},
{
"epoch": 0.03760198652004257,
"grad_norm": 0.2575814127922058,
"learning_rate": 4.4854634916428583e-05,
"loss": 0.4188,
"num_tokens": 201362056.0,
"step": 318
},
{
"epoch": 0.03772023176067163,
"grad_norm": 0.24522624909877777,
"learning_rate": 4.4761495598154706e-05,
"loss": 0.4012,
"num_tokens": 201996006.0,
"step": 319
},
{
"epoch": 0.0378384770013007,
"grad_norm": 0.2399868369102478,
"learning_rate": 4.466818322097253e-05,
"loss": 0.3726,
"num_tokens": 202591057.0,
"step": 320
},
{
"epoch": 0.03795672224192976,
"grad_norm": 0.23226316273212433,
"learning_rate": 4.4574699157882465e-05,
"loss": 0.3846,
"num_tokens": 203228812.0,
"step": 321
},
{
"epoch": 0.03807496748255883,
"grad_norm": 0.263351172208786,
"learning_rate": 4.44810447844111e-05,
"loss": 0.4168,
"num_tokens": 203868294.0,
"step": 322
},
{
"epoch": 0.03819321272318789,
"grad_norm": 0.2123018354177475,
"learning_rate": 4.438722147859095e-05,
"loss": 0.3815,
"num_tokens": 204499481.0,
"step": 323
},
{
"epoch": 0.03831145796381696,
"grad_norm": 0.2778543531894684,
"learning_rate": 4.429323062094026e-05,
"loss": 0.3969,
"num_tokens": 205133494.0,
"step": 324
},
{
"epoch": 0.03842970320444602,
"grad_norm": 0.2408173829317093,
"learning_rate": 4.419907359444259e-05,
"loss": 0.4108,
"num_tokens": 205767024.0,
"step": 325
},
{
"epoch": 0.038547948445075086,
"grad_norm": 0.26782068610191345,
"learning_rate": 4.410475178452652e-05,
"loss": 0.4291,
"num_tokens": 206400825.0,
"step": 326
},
{
"epoch": 0.03866619368570415,
"grad_norm": 0.26312699913978577,
"learning_rate": 4.4010266579045256e-05,
"loss": 0.4136,
"num_tokens": 207040239.0,
"step": 327
},
{
"epoch": 0.038784438926333216,
"grad_norm": 0.256391704082489,
"learning_rate": 4.391561936825623e-05,
"loss": 0.3959,
"num_tokens": 207676732.0,
"step": 328
},
{
"epoch": 0.03890268416696228,
"grad_norm": 0.2285778969526291,
"learning_rate": 4.3820811544800617e-05,
"loss": 0.3881,
"num_tokens": 208313021.0,
"step": 329
},
{
"epoch": 0.039020929407591345,
"grad_norm": 0.2927227318286896,
"learning_rate": 4.372584450368283e-05,
"loss": 0.4485,
"num_tokens": 208946344.0,
"step": 330
},
{
"epoch": 0.03913917464822041,
"grad_norm": 0.25876858830451965,
"learning_rate": 4.3630719642250034e-05,
"loss": 0.4692,
"num_tokens": 209577542.0,
"step": 331
},
{
"epoch": 0.039257419888849475,
"grad_norm": 0.2661622166633606,
"learning_rate": 4.3535438360171556e-05,
"loss": 0.4608,
"num_tokens": 210213046.0,
"step": 332
},
{
"epoch": 0.039375665129478536,
"grad_norm": 0.2588401436805725,
"learning_rate": 4.344000205941831e-05,
"loss": 0.4155,
"num_tokens": 210848130.0,
"step": 333
},
{
"epoch": 0.039493910370107604,
"grad_norm": 0.25796785950660706,
"learning_rate": 4.3344412144242146e-05,
"loss": 0.4037,
"num_tokens": 211482121.0,
"step": 334
},
{
"epoch": 0.039612155610736666,
"grad_norm": 0.2662915587425232,
"learning_rate": 4.3248670021155206e-05,
"loss": 0.4512,
"num_tokens": 212120668.0,
"step": 335
},
{
"epoch": 0.039730400851365734,
"grad_norm": 0.24322502315044403,
"learning_rate": 4.315277709890922e-05,
"loss": 0.4174,
"num_tokens": 212756102.0,
"step": 336
},
{
"epoch": 0.039848646091994795,
"grad_norm": 0.2540619671344757,
"learning_rate": 4.3056734788474785e-05,
"loss": 0.4436,
"num_tokens": 213392130.0,
"step": 337
},
{
"epoch": 0.03996689133262386,
"grad_norm": 0.24154382944107056,
"learning_rate": 4.29605445030206e-05,
"loss": 0.3931,
"num_tokens": 214021365.0,
"step": 338
},
{
"epoch": 0.040085136573252925,
"grad_norm": 0.24840545654296875,
"learning_rate": 4.286420765789267e-05,
"loss": 0.4088,
"num_tokens": 214651340.0,
"step": 339
},
{
"epoch": 0.04020338181388199,
"grad_norm": 0.2844981551170349,
"learning_rate": 4.276772567059347e-05,
"loss": 0.4351,
"num_tokens": 215284267.0,
"step": 340
},
{
"epoch": 0.040321627054511054,
"grad_norm": 0.2580728232860565,
"learning_rate": 4.2671099960761116e-05,
"loss": 0.4454,
"num_tokens": 215920647.0,
"step": 341
},
{
"epoch": 0.04043987229514012,
"grad_norm": 0.2855488061904907,
"learning_rate": 4.257433195014846e-05,
"loss": 0.3805,
"num_tokens": 216555518.0,
"step": 342
},
{
"epoch": 0.040558117535769184,
"grad_norm": 0.21848393976688385,
"learning_rate": 4.247742306260217e-05,
"loss": 0.3795,
"num_tokens": 217191272.0,
"step": 343
},
{
"epoch": 0.04067636277639825,
"grad_norm": 0.264885276556015,
"learning_rate": 4.238037472404176e-05,
"loss": 0.4108,
"num_tokens": 217824700.0,
"step": 344
},
{
"epoch": 0.04079460801702731,
"grad_norm": 0.2161663919687271,
"learning_rate": 4.228318836243865e-05,
"loss": 0.3729,
"num_tokens": 218455560.0,
"step": 345
},
{
"epoch": 0.04091285325765638,
"grad_norm": 0.22689329087734222,
"learning_rate": 4.218586540779515e-05,
"loss": 0.421,
"num_tokens": 219091298.0,
"step": 346
},
{
"epoch": 0.04103109849828544,
"grad_norm": 0.24377533793449402,
"learning_rate": 4.208840729212337e-05,
"loss": 0.3951,
"num_tokens": 219727733.0,
"step": 347
},
{
"epoch": 0.04114934373891451,
"grad_norm": 0.24370762705802917,
"learning_rate": 4.199081544942418e-05,
"loss": 0.4481,
"num_tokens": 220360695.0,
"step": 348
},
{
"epoch": 0.04126758897954357,
"grad_norm": 0.23610427975654602,
"learning_rate": 4.189309131566615e-05,
"loss": 0.4373,
"num_tokens": 220993405.0,
"step": 349
},
{
"epoch": 0.04138583422017264,
"grad_norm": 0.2471226155757904,
"learning_rate": 4.1795236328764354e-05,
"loss": 0.4307,
"num_tokens": 221619384.0,
"step": 350
},
{
"epoch": 0.0415040794608017,
"grad_norm": 0.2555200159549713,
"learning_rate": 4.169725192855925e-05,
"loss": 0.4149,
"num_tokens": 222250253.0,
"step": 351
},
{
"epoch": 0.04162232470143077,
"grad_norm": 0.26108643412590027,
"learning_rate": 4.159913955679548e-05,
"loss": 0.4016,
"num_tokens": 222884935.0,
"step": 352
},
{
"epoch": 0.04174056994205983,
"grad_norm": 0.22140191495418549,
"learning_rate": 4.150090065710067e-05,
"loss": 0.4025,
"num_tokens": 223516629.0,
"step": 353
},
{
"epoch": 0.0418588151826889,
"grad_norm": 0.2396477907896042,
"learning_rate": 4.1402536674964195e-05,
"loss": 0.4105,
"num_tokens": 224150031.0,
"step": 354
},
{
"epoch": 0.04197706042331796,
"grad_norm": 0.23356612026691437,
"learning_rate": 4.130404905771586e-05,
"loss": 0.3962,
"num_tokens": 224786071.0,
"step": 355
},
{
"epoch": 0.04209530566394703,
"grad_norm": 0.2547277510166168,
"learning_rate": 4.1205439254504666e-05,
"loss": 0.4314,
"num_tokens": 225421240.0,
"step": 356
},
{
"epoch": 0.04221355090457609,
"grad_norm": 0.2576862871646881,
"learning_rate": 4.110670871627745e-05,
"loss": 0.396,
"num_tokens": 226052174.0,
"step": 357
},
{
"epoch": 0.04233179614520516,
"grad_norm": 0.22883984446525574,
"learning_rate": 4.100785889575757e-05,
"loss": 0.4374,
"num_tokens": 226689398.0,
"step": 358
},
{
"epoch": 0.04245004138583422,
"grad_norm": 0.23827779293060303,
"learning_rate": 4.090889124742346e-05,
"loss": 0.4014,
"num_tokens": 227327616.0,
"step": 359
},
{
"epoch": 0.04256828662646329,
"grad_norm": 0.22566570341587067,
"learning_rate": 4.080980722748733e-05,
"loss": 0.4054,
"num_tokens": 227952686.0,
"step": 360
},
{
"epoch": 0.04268653186709235,
"grad_norm": 0.2515687644481659,
"learning_rate": 4.0710608293873634e-05,
"loss": 0.4194,
"num_tokens": 228587586.0,
"step": 361
},
{
"epoch": 0.04280477710772142,
"grad_norm": 0.2160085290670395,
"learning_rate": 4.0611295906197706e-05,
"loss": 0.4048,
"num_tokens": 229185285.0,
"step": 362
},
{
"epoch": 0.04292302234835048,
"grad_norm": 0.21602442860603333,
"learning_rate": 4.0511871525744224e-05,
"loss": 0.3995,
"num_tokens": 229815886.0,
"step": 363
},
{
"epoch": 0.04304126758897955,
"grad_norm": 0.26638063788414,
"learning_rate": 4.041233661544574e-05,
"loss": 0.4104,
"num_tokens": 230449875.0,
"step": 364
},
{
"epoch": 0.04315951282960861,
"grad_norm": 0.21101397275924683,
"learning_rate": 4.0312692639861146e-05,
"loss": 0.4125,
"num_tokens": 231087769.0,
"step": 365
},
{
"epoch": 0.043277758070237676,
"grad_norm": 0.22914250195026398,
"learning_rate": 4.021294106515411e-05,
"loss": 0.3969,
"num_tokens": 231720719.0,
"step": 366
},
{
"epoch": 0.04339600331086674,
"grad_norm": 0.21389196813106537,
"learning_rate": 4.011308335907152e-05,
"loss": 0.3922,
"num_tokens": 232354694.0,
"step": 367
},
{
"epoch": 0.0435142485514958,
"grad_norm": 0.22924332320690155,
"learning_rate": 4.00131209909219e-05,
"loss": 0.4202,
"num_tokens": 232986853.0,
"step": 368
},
{
"epoch": 0.04363249379212487,
"grad_norm": 0.2374032735824585,
"learning_rate": 3.991305543155378e-05,
"loss": 0.4575,
"num_tokens": 233626246.0,
"step": 369
},
{
"epoch": 0.04375073903275393,
"grad_norm": 0.20903757214546204,
"learning_rate": 3.981288815333399e-05,
"loss": 0.3508,
"num_tokens": 234256236.0,
"step": 370
},
{
"epoch": 0.043868984273383,
"grad_norm": 0.23430699110031128,
"learning_rate": 3.971262063012612e-05,
"loss": 0.4202,
"num_tokens": 234894656.0,
"step": 371
},
{
"epoch": 0.04398722951401206,
"grad_norm": 0.21054008603096008,
"learning_rate": 3.9612254337268734e-05,
"loss": 0.4029,
"num_tokens": 235530175.0,
"step": 372
},
{
"epoch": 0.044105474754641126,
"grad_norm": 0.22597409784793854,
"learning_rate": 3.95117907515537e-05,
"loss": 0.3881,
"num_tokens": 236165286.0,
"step": 373
},
{
"epoch": 0.04422371999527019,
"grad_norm": 0.24336762726306915,
"learning_rate": 3.941123135120445e-05,
"loss": 0.389,
"num_tokens": 236799872.0,
"step": 374
},
{
"epoch": 0.044341965235899256,
"grad_norm": 0.2279030978679657,
"learning_rate": 3.9310577615854264e-05,
"loss": 0.3643,
"num_tokens": 237436361.0,
"step": 375
},
{
"epoch": 0.04446021047652832,
"grad_norm": 0.20615456998348236,
"learning_rate": 3.920983102652443e-05,
"loss": 0.3824,
"num_tokens": 238072053.0,
"step": 376
},
{
"epoch": 0.044578455717157385,
"grad_norm": 0.22816775739192963,
"learning_rate": 3.910899306560251e-05,
"loss": 0.4291,
"num_tokens": 238707861.0,
"step": 377
},
{
"epoch": 0.044696700957786446,
"grad_norm": 0.22566092014312744,
"learning_rate": 3.9008065216820486e-05,
"loss": 0.3967,
"num_tokens": 239340071.0,
"step": 378
},
{
"epoch": 0.044814946198415515,
"grad_norm": 0.22702094912528992,
"learning_rate": 3.890704896523302e-05,
"loss": 0.4185,
"num_tokens": 239974165.0,
"step": 379
},
{
"epoch": 0.044933191439044576,
"grad_norm": 0.20416148006916046,
"learning_rate": 3.880594579719545e-05,
"loss": 0.3879,
"num_tokens": 240606077.0,
"step": 380
},
{
"epoch": 0.045051436679673644,
"grad_norm": 0.2429252415895462,
"learning_rate": 3.870475720034206e-05,
"loss": 0.4027,
"num_tokens": 241243195.0,
"step": 381
},
{
"epoch": 0.045169681920302705,
"grad_norm": 0.24931378662586212,
"learning_rate": 3.860348466356413e-05,
"loss": 0.4474,
"num_tokens": 241881692.0,
"step": 382
},
{
"epoch": 0.045287927160931774,
"grad_norm": 0.26254212856292725,
"learning_rate": 3.850212967698799e-05,
"loss": 0.4189,
"num_tokens": 242520949.0,
"step": 383
},
{
"epoch": 0.045406172401560835,
"grad_norm": 0.2300311028957367,
"learning_rate": 3.84006937319532e-05,
"loss": 0.3986,
"num_tokens": 243160575.0,
"step": 384
},
{
"epoch": 0.0455244176421899,
"grad_norm": 0.24005557596683502,
"learning_rate": 3.829917832099051e-05,
"loss": 0.4128,
"num_tokens": 243790943.0,
"step": 385
},
{
"epoch": 0.045642662882818964,
"grad_norm": 0.2699725031852722,
"learning_rate": 3.819758493779992e-05,
"loss": 0.4602,
"num_tokens": 244423844.0,
"step": 386
},
{
"epoch": 0.04576090812344803,
"grad_norm": 0.23983405530452728,
"learning_rate": 3.8095915077228754e-05,
"loss": 0.3914,
"num_tokens": 245054470.0,
"step": 387
},
{
"epoch": 0.045879153364077094,
"grad_norm": 0.2433352917432785,
"learning_rate": 3.79941702352496e-05,
"loss": 0.3811,
"num_tokens": 245688487.0,
"step": 388
},
{
"epoch": 0.04599739860470616,
"grad_norm": 0.24374330043792725,
"learning_rate": 3.7892351908938326e-05,
"loss": 0.4106,
"num_tokens": 246325682.0,
"step": 389
},
{
"epoch": 0.04611564384533522,
"grad_norm": 0.21965977549552917,
"learning_rate": 3.7790461596452057e-05,
"loss": 0.4311,
"num_tokens": 246961506.0,
"step": 390
},
{
"epoch": 0.04623388908596429,
"grad_norm": 0.23189356923103333,
"learning_rate": 3.7688500797007124e-05,
"loss": 0.3798,
"num_tokens": 247594032.0,
"step": 391
},
{
"epoch": 0.04635213432659335,
"grad_norm": 0.2253284901380539,
"learning_rate": 3.758647101085699e-05,
"loss": 0.427,
"num_tokens": 248227593.0,
"step": 392
},
{
"epoch": 0.04647037956722242,
"grad_norm": 0.2451157420873642,
"learning_rate": 3.748437373927022e-05,
"loss": 0.4083,
"num_tokens": 248859376.0,
"step": 393
},
{
"epoch": 0.04658862480785148,
"grad_norm": 0.22249139845371246,
"learning_rate": 3.738221048450834e-05,
"loss": 0.4254,
"num_tokens": 249493350.0,
"step": 394
},
{
"epoch": 0.04670687004848055,
"grad_norm": 0.24161191284656525,
"learning_rate": 3.7279982749803736e-05,
"loss": 0.3853,
"num_tokens": 250126507.0,
"step": 395
},
{
"epoch": 0.04682511528910961,
"grad_norm": 0.23410917818546295,
"learning_rate": 3.717769203933759e-05,
"loss": 0.424,
"num_tokens": 250765825.0,
"step": 396
},
{
"epoch": 0.04694336052973868,
"grad_norm": 0.23993557691574097,
"learning_rate": 3.7075339858217706e-05,
"loss": 0.4189,
"num_tokens": 251403372.0,
"step": 397
},
{
"epoch": 0.04706160577036774,
"grad_norm": 0.32063019275665283,
"learning_rate": 3.697292771245633e-05,
"loss": 0.4365,
"num_tokens": 252041762.0,
"step": 398
},
{
"epoch": 0.04717985101099681,
"grad_norm": 0.21619325876235962,
"learning_rate": 3.687045710894808e-05,
"loss": 0.4027,
"num_tokens": 252673716.0,
"step": 399
},
{
"epoch": 0.04729809625162587,
"grad_norm": 0.2204645574092865,
"learning_rate": 3.67679295554477e-05,
"loss": 0.4117,
"num_tokens": 253310805.0,
"step": 400
},
{
"epoch": 0.04741634149225494,
"grad_norm": 0.21249093115329742,
"learning_rate": 3.666534656054788e-05,
"loss": 0.3398,
"num_tokens": 253944650.0,
"step": 401
},
{
"epoch": 0.047534586732884,
"grad_norm": 0.2515881061553955,
"learning_rate": 3.65627096336571e-05,
"loss": 0.3814,
"num_tokens": 254579224.0,
"step": 402
},
{
"epoch": 0.04765283197351307,
"grad_norm": 0.22720524668693542,
"learning_rate": 3.646002028497738e-05,
"loss": 0.3981,
"num_tokens": 255212393.0,
"step": 403
},
{
"epoch": 0.04777107721414213,
"grad_norm": 0.24506784975528717,
"learning_rate": 3.63572800254821e-05,
"loss": 0.4215,
"num_tokens": 255846216.0,
"step": 404
},
{
"epoch": 0.0478893224547712,
"grad_norm": 0.25425246357917786,
"learning_rate": 3.625449036689372e-05,
"loss": 0.429,
"num_tokens": 256484541.0,
"step": 405
},
{
"epoch": 0.04800756769540026,
"grad_norm": 0.23869769275188446,
"learning_rate": 3.6151652821661576e-05,
"loss": 0.3881,
"num_tokens": 257114691.0,
"step": 406
},
{
"epoch": 0.04812581293602933,
"grad_norm": 0.2546592056751251,
"learning_rate": 3.604876890293959e-05,
"loss": 0.4059,
"num_tokens": 257748044.0,
"step": 407
},
{
"epoch": 0.04824405817665839,
"grad_norm": 0.22846068441867828,
"learning_rate": 3.594584012456403e-05,
"loss": 0.3613,
"num_tokens": 258386984.0,
"step": 408
},
{
"epoch": 0.04836230341728746,
"grad_norm": 0.24633820354938507,
"learning_rate": 3.584286800103124e-05,
"loss": 0.4298,
"num_tokens": 259023318.0,
"step": 409
},
{
"epoch": 0.04848054865791652,
"grad_norm": 0.2492648810148239,
"learning_rate": 3.573985404747535e-05,
"loss": 0.383,
"num_tokens": 259657204.0,
"step": 410
},
{
"epoch": 0.048598793898545586,
"grad_norm": 0.22464512288570404,
"learning_rate": 3.563679977964595e-05,
"loss": 0.3838,
"num_tokens": 260290556.0,
"step": 411
},
{
"epoch": 0.04871703913917465,
"grad_norm": 0.27683940529823303,
"learning_rate": 3.5533706713885844e-05,
"loss": 0.4461,
"num_tokens": 260928576.0,
"step": 412
},
{
"epoch": 0.048835284379803716,
"grad_norm": 0.20443028211593628,
"learning_rate": 3.5430576367108694e-05,
"loss": 0.3948,
"num_tokens": 261563484.0,
"step": 413
},
{
"epoch": 0.04895352962043278,
"grad_norm": 0.20911704003810883,
"learning_rate": 3.532741025677673e-05,
"loss": 0.3649,
"num_tokens": 262198058.0,
"step": 414
},
{
"epoch": 0.049071774861061845,
"grad_norm": 0.27862629294395447,
"learning_rate": 3.522420990087839e-05,
"loss": 0.4237,
"num_tokens": 262825300.0,
"step": 415
},
{
"epoch": 0.04919002010169091,
"grad_norm": 0.2638210654258728,
"learning_rate": 3.5120976817906e-05,
"loss": 0.4384,
"num_tokens": 263458362.0,
"step": 416
},
{
"epoch": 0.049308265342319975,
"grad_norm": 0.24697713553905487,
"learning_rate": 3.5017712526833454e-05,
"loss": 0.3814,
"num_tokens": 264088367.0,
"step": 417
},
{
"epoch": 0.049426510582949036,
"grad_norm": 0.2173382192850113,
"learning_rate": 3.491441854709384e-05,
"loss": 0.3949,
"num_tokens": 264724592.0,
"step": 418
},
{
"epoch": 0.049544755823578104,
"grad_norm": 0.25613975524902344,
"learning_rate": 3.481109639855707e-05,
"loss": 0.3821,
"num_tokens": 265360262.0,
"step": 419
},
{
"epoch": 0.049663001064207166,
"grad_norm": 0.24708124995231628,
"learning_rate": 3.470774760150753e-05,
"loss": 0.4341,
"num_tokens": 265997689.0,
"step": 420
},
{
"epoch": 0.04978124630483623,
"grad_norm": 0.23348525166511536,
"learning_rate": 3.460437367662173e-05,
"loss": 0.4044,
"num_tokens": 266631262.0,
"step": 421
},
{
"epoch": 0.049899491545465295,
"grad_norm": 0.23553021252155304,
"learning_rate": 3.450097614494592e-05,
"loss": 0.3966,
"num_tokens": 267268979.0,
"step": 422
},
{
"epoch": 0.050017736786094356,
"grad_norm": 0.2573988139629364,
"learning_rate": 3.439755652787366e-05,
"loss": 0.4017,
"num_tokens": 267904627.0,
"step": 423
},
{
"epoch": 0.050135982026723425,
"grad_norm": 0.22248908877372742,
"learning_rate": 3.4294116347123505e-05,
"loss": 0.357,
"num_tokens": 268543181.0,
"step": 424
},
{
"epoch": 0.050254227267352486,
"grad_norm": 0.22894316911697388,
"learning_rate": 3.419065712471659e-05,
"loss": 0.4027,
"num_tokens": 269179996.0,
"step": 425
},
{
"epoch": 0.050372472507981554,
"grad_norm": 0.25380998849868774,
"learning_rate": 3.4087180382954214e-05,
"loss": 0.3843,
"num_tokens": 269811253.0,
"step": 426
},
{
"epoch": 0.050490717748610615,
"grad_norm": 0.23106823861598969,
"learning_rate": 3.398368764439546e-05,
"loss": 0.39,
"num_tokens": 270441984.0,
"step": 427
},
{
"epoch": 0.050608962989239684,
"grad_norm": 0.22412751615047455,
"learning_rate": 3.388018043183478e-05,
"loss": 0.3997,
"num_tokens": 271074224.0,
"step": 428
},
{
"epoch": 0.050727208229868745,
"grad_norm": 0.2578945457935333,
"learning_rate": 3.377666026827962e-05,
"loss": 0.446,
"num_tokens": 271705707.0,
"step": 429
},
{
"epoch": 0.05084545347049781,
"grad_norm": 0.23338672518730164,
"learning_rate": 3.367312867692797e-05,
"loss": 0.379,
"num_tokens": 272318335.0,
"step": 430
},
{
"epoch": 0.050963698711126874,
"grad_norm": 0.209132581949234,
"learning_rate": 3.3569587181145974e-05,
"loss": 0.416,
"num_tokens": 272955472.0,
"step": 431
},
{
"epoch": 0.05108194395175594,
"grad_norm": 0.21573707461357117,
"learning_rate": 3.346603730444549e-05,
"loss": 0.4051,
"num_tokens": 273559901.0,
"step": 432
},
{
"epoch": 0.051200189192385004,
"grad_norm": 0.24565227329730988,
"learning_rate": 3.336248057046174e-05,
"loss": 0.4033,
"num_tokens": 274192648.0,
"step": 433
},
{
"epoch": 0.05131843443301407,
"grad_norm": 0.20935006439685822,
"learning_rate": 3.325891850293078e-05,
"loss": 0.373,
"num_tokens": 274831825.0,
"step": 434
},
{
"epoch": 0.05143667967364313,
"grad_norm": 0.2096380591392517,
"learning_rate": 3.315535262566722e-05,
"loss": 0.358,
"num_tokens": 275456510.0,
"step": 435
},
{
"epoch": 0.0515549249142722,
"grad_norm": 0.22659966349601746,
"learning_rate": 3.305178446254166e-05,
"loss": 0.3623,
"num_tokens": 276091876.0,
"step": 436
},
{
"epoch": 0.05167317015490126,
"grad_norm": 0.21803656220436096,
"learning_rate": 3.294821553745835e-05,
"loss": 0.4107,
"num_tokens": 276727335.0,
"step": 437
},
{
"epoch": 0.05179141539553033,
"grad_norm": 0.207914799451828,
"learning_rate": 3.284464737433279e-05,
"loss": 0.4361,
"num_tokens": 277359081.0,
"step": 438
},
{
"epoch": 0.05190966063615939,
"grad_norm": 0.20631778240203857,
"learning_rate": 3.2741081497069215e-05,
"loss": 0.3707,
"num_tokens": 277990765.0,
"step": 439
},
{
"epoch": 0.05202790587678846,
"grad_norm": 0.2180744856595993,
"learning_rate": 3.263751942953828e-05,
"loss": 0.3962,
"num_tokens": 278612675.0,
"step": 440
},
{
"epoch": 0.05214615111741752,
"grad_norm": 0.19695664942264557,
"learning_rate": 3.2533962695554515e-05,
"loss": 0.3742,
"num_tokens": 279246447.0,
"step": 441
},
{
"epoch": 0.05226439635804659,
"grad_norm": 0.2566263973712921,
"learning_rate": 3.243041281885404e-05,
"loss": 0.4438,
"num_tokens": 279880451.0,
"step": 442
},
{
"epoch": 0.05238264159867565,
"grad_norm": 0.25131720304489136,
"learning_rate": 3.232687132307204e-05,
"loss": 0.4346,
"num_tokens": 280517149.0,
"step": 443
},
{
"epoch": 0.05250088683930472,
"grad_norm": 0.21113261580467224,
"learning_rate": 3.222333973172039e-05,
"loss": 0.3724,
"num_tokens": 281144009.0,
"step": 444
},
{
"epoch": 0.05261913207993378,
"grad_norm": 0.19451619684696198,
"learning_rate": 3.211981956816523e-05,
"loss": 0.3889,
"num_tokens": 281781076.0,
"step": 445
},
{
"epoch": 0.05273737732056285,
"grad_norm": 0.20863431692123413,
"learning_rate": 3.201631235560456e-05,
"loss": 0.3956,
"num_tokens": 282418028.0,
"step": 446
},
{
"epoch": 0.05285562256119191,
"grad_norm": 0.22713254392147064,
"learning_rate": 3.1912819617045805e-05,
"loss": 0.385,
"num_tokens": 283052667.0,
"step": 447
},
{
"epoch": 0.05297386780182098,
"grad_norm": 0.22602516412734985,
"learning_rate": 3.180934287528342e-05,
"loss": 0.4338,
"num_tokens": 283689384.0,
"step": 448
},
{
"epoch": 0.05309211304245004,
"grad_norm": 0.18873152136802673,
"learning_rate": 3.170588365287651e-05,
"loss": 0.3618,
"num_tokens": 284325980.0,
"step": 449
},
{
"epoch": 0.05321035828307911,
"grad_norm": 0.22596846520900726,
"learning_rate": 3.1602443472126344e-05,
"loss": 0.4158,
"num_tokens": 284927410.0,
"step": 450
},
{
"epoch": 0.05332860352370817,
"grad_norm": 0.20799311995506287,
"learning_rate": 3.1499023855054086e-05,
"loss": 0.4023,
"num_tokens": 285561853.0,
"step": 451
},
{
"epoch": 0.05344684876433724,
"grad_norm": 0.21219973266124725,
"learning_rate": 3.1395626323378266e-05,
"loss": 0.4127,
"num_tokens": 286200667.0,
"step": 452
},
{
"epoch": 0.0535650940049663,
"grad_norm": 0.23271985352039337,
"learning_rate": 3.129225239849247e-05,
"loss": 0.377,
"num_tokens": 286838297.0,
"step": 453
},
{
"epoch": 0.05368333924559537,
"grad_norm": 0.23054036498069763,
"learning_rate": 3.118890360144293e-05,
"loss": 0.3806,
"num_tokens": 287478013.0,
"step": 454
},
{
"epoch": 0.05380158448622443,
"grad_norm": 0.21830712258815765,
"learning_rate": 3.1085581452906166e-05,
"loss": 0.4124,
"num_tokens": 288113641.0,
"step": 455
},
{
"epoch": 0.053919829726853497,
"grad_norm": 0.21437396109104156,
"learning_rate": 3.0982287473166544e-05,
"loss": 0.4056,
"num_tokens": 288748606.0,
"step": 456
},
{
"epoch": 0.05403807496748256,
"grad_norm": 0.23408770561218262,
"learning_rate": 3.087902318209401e-05,
"loss": 0.3841,
"num_tokens": 289387862.0,
"step": 457
},
{
"epoch": 0.054156320208111626,
"grad_norm": 0.22132480144500732,
"learning_rate": 3.0775790099121615e-05,
"loss": 0.3859,
"num_tokens": 290025351.0,
"step": 458
},
{
"epoch": 0.05427456544874069,
"grad_norm": 0.21784645318984985,
"learning_rate": 3.067258974322328e-05,
"loss": 0.3752,
"num_tokens": 290660065.0,
"step": 459
},
{
"epoch": 0.054392810689369755,
"grad_norm": 0.21862168610095978,
"learning_rate": 3.056942363289131e-05,
"loss": 0.3808,
"num_tokens": 291293769.0,
"step": 460
},
{
"epoch": 0.05451105592999882,
"grad_norm": 0.23824048042297363,
"learning_rate": 3.0466293286114164e-05,
"loss": 0.3878,
"num_tokens": 291932970.0,
"step": 461
},
{
"epoch": 0.054629301170627885,
"grad_norm": 0.20896966755390167,
"learning_rate": 3.036320022035405e-05,
"loss": 0.3958,
"num_tokens": 292570196.0,
"step": 462
},
{
"epoch": 0.054747546411256946,
"grad_norm": 0.22659938037395477,
"learning_rate": 3.0260145952524658e-05,
"loss": 0.4281,
"num_tokens": 293208953.0,
"step": 463
},
{
"epoch": 0.054865791651886014,
"grad_norm": 0.25533124804496765,
"learning_rate": 3.0157131998968765e-05,
"loss": 0.3964,
"num_tokens": 293841194.0,
"step": 464
},
{
"epoch": 0.054984036892515076,
"grad_norm": 0.21840247511863708,
"learning_rate": 3.0054159875435977e-05,
"loss": 0.4031,
"num_tokens": 294478601.0,
"step": 465
},
{
"epoch": 0.055102282133144144,
"grad_norm": 0.20685090124607086,
"learning_rate": 2.995123109706042e-05,
"loss": 0.3979,
"num_tokens": 295110727.0,
"step": 466
},
{
"epoch": 0.055220527373773205,
"grad_norm": 0.24118992686271667,
"learning_rate": 2.984834717833843e-05,
"loss": 0.3724,
"num_tokens": 295744614.0,
"step": 467
},
{
"epoch": 0.05533877261440227,
"grad_norm": 0.21696403622627258,
"learning_rate": 2.9745509633106285e-05,
"loss": 0.3875,
"num_tokens": 296380861.0,
"step": 468
},
{
"epoch": 0.055457017855031335,
"grad_norm": 0.2347799837589264,
"learning_rate": 2.964271997451791e-05,
"loss": 0.3951,
"num_tokens": 297007279.0,
"step": 469
},
{
"epoch": 0.0555752630956604,
"grad_norm": 0.2174369990825653,
"learning_rate": 2.9539979715022626e-05,
"loss": 0.3757,
"num_tokens": 297623126.0,
"step": 470
},
{
"epoch": 0.055693508336289464,
"grad_norm": 0.2637596130371094,
"learning_rate": 2.943729036634291e-05,
"loss": 0.4343,
"num_tokens": 298255789.0,
"step": 471
},
{
"epoch": 0.05581175357691853,
"grad_norm": 0.20758095383644104,
"learning_rate": 2.9334653439452135e-05,
"loss": 0.4108,
"num_tokens": 298885491.0,
"step": 472
},
{
"epoch": 0.055929998817547594,
"grad_norm": 0.2174261063337326,
"learning_rate": 2.9232070444552315e-05,
"loss": 0.3799,
"num_tokens": 299521680.0,
"step": 473
},
{
"epoch": 0.056048244058176655,
"grad_norm": 0.23763810098171234,
"learning_rate": 2.9129542891051922e-05,
"loss": 0.3902,
"num_tokens": 300161168.0,
"step": 474
},
{
"epoch": 0.05616648929880572,
"grad_norm": 0.21108706295490265,
"learning_rate": 2.9027072287543666e-05,
"loss": 0.3795,
"num_tokens": 300793878.0,
"step": 475
},
{
"epoch": 0.056284734539434784,
"grad_norm": 0.22178317606449127,
"learning_rate": 2.89246601417823e-05,
"loss": 0.4045,
"num_tokens": 301427428.0,
"step": 476
},
{
"epoch": 0.05640297978006385,
"grad_norm": 0.23109114170074463,
"learning_rate": 2.8822307960662403e-05,
"loss": 0.4327,
"num_tokens": 302063334.0,
"step": 477
},
{
"epoch": 0.056521225020692914,
"grad_norm": 0.23473629355430603,
"learning_rate": 2.8720017250196266e-05,
"loss": 0.3764,
"num_tokens": 302696034.0,
"step": 478
},
{
"epoch": 0.05663947026132198,
"grad_norm": 0.22509372234344482,
"learning_rate": 2.861778951549167e-05,
"loss": 0.4,
"num_tokens": 303331655.0,
"step": 479
},
{
"epoch": 0.056757715501951043,
"grad_norm": 0.2507939040660858,
"learning_rate": 2.851562626072978e-05,
"loss": 0.4712,
"num_tokens": 303962802.0,
"step": 480
},
{
"epoch": 0.05687596074258011,
"grad_norm": 0.22742438316345215,
"learning_rate": 2.8413528989143004e-05,
"loss": 0.3897,
"num_tokens": 304595703.0,
"step": 481
},
{
"epoch": 0.05699420598320917,
"grad_norm": 0.2183639109134674,
"learning_rate": 2.8311499202992885e-05,
"loss": 0.3931,
"num_tokens": 305227732.0,
"step": 482
},
{
"epoch": 0.05711245122383824,
"grad_norm": 0.21615217626094818,
"learning_rate": 2.820953840354795e-05,
"loss": 0.3938,
"num_tokens": 305861092.0,
"step": 483
},
{
"epoch": 0.0572306964644673,
"grad_norm": 0.22431129217147827,
"learning_rate": 2.810764809106168e-05,
"loss": 0.3977,
"num_tokens": 306497173.0,
"step": 484
},
{
"epoch": 0.05734894170509637,
"grad_norm": 0.2126999795436859,
"learning_rate": 2.800582976475041e-05,
"loss": 0.3847,
"num_tokens": 307133773.0,
"step": 485
},
{
"epoch": 0.05746718694572543,
"grad_norm": 0.21983444690704346,
"learning_rate": 2.7904084922771254e-05,
"loss": 0.3773,
"num_tokens": 307760142.0,
"step": 486
},
{
"epoch": 0.0575854321863545,
"grad_norm": 0.20621925592422485,
"learning_rate": 2.7802415062200087e-05,
"loss": 0.4089,
"num_tokens": 308392658.0,
"step": 487
},
{
"epoch": 0.05770367742698356,
"grad_norm": 0.2080400586128235,
"learning_rate": 2.77008216790095e-05,
"loss": 0.3654,
"num_tokens": 309023200.0,
"step": 488
},
{
"epoch": 0.05782192266761263,
"grad_norm": 0.21669632196426392,
"learning_rate": 2.759930626804681e-05,
"loss": 0.4097,
"num_tokens": 309657191.0,
"step": 489
},
{
"epoch": 0.05794016790824169,
"grad_norm": 0.2028190642595291,
"learning_rate": 2.7497870323012014e-05,
"loss": 0.4037,
"num_tokens": 310290361.0,
"step": 490
},
{
"epoch": 0.05805841314887076,
"grad_norm": 0.23138827085494995,
"learning_rate": 2.7396515336435878e-05,
"loss": 0.4207,
"num_tokens": 310922697.0,
"step": 491
},
{
"epoch": 0.05817665838949982,
"grad_norm": 0.23582817614078522,
"learning_rate": 2.7295242799657938e-05,
"loss": 0.4111,
"num_tokens": 311557453.0,
"step": 492
},
{
"epoch": 0.05829490363012889,
"grad_norm": 0.20863734185695648,
"learning_rate": 2.7194054202804555e-05,
"loss": 0.4126,
"num_tokens": 312193193.0,
"step": 493
},
{
"epoch": 0.05841314887075795,
"grad_norm": 0.21243295073509216,
"learning_rate": 2.709295103476699e-05,
"loss": 0.4107,
"num_tokens": 312828473.0,
"step": 494
},
{
"epoch": 0.05853139411138702,
"grad_norm": 0.21561166644096375,
"learning_rate": 2.6991934783179515e-05,
"loss": 0.3824,
"num_tokens": 313462344.0,
"step": 495
},
{
"epoch": 0.05864963935201608,
"grad_norm": 0.23026616871356964,
"learning_rate": 2.6891006934397505e-05,
"loss": 0.3821,
"num_tokens": 314080640.0,
"step": 496
},
{
"epoch": 0.05876788459264515,
"grad_norm": 0.2129206657409668,
"learning_rate": 2.6790168973475585e-05,
"loss": 0.3938,
"num_tokens": 314717785.0,
"step": 497
},
{
"epoch": 0.05888612983327421,
"grad_norm": 0.23650778830051422,
"learning_rate": 2.6689422384145744e-05,
"loss": 0.4503,
"num_tokens": 315351322.0,
"step": 498
},
{
"epoch": 0.05900437507390328,
"grad_norm": 0.20518648624420166,
"learning_rate": 2.658876864879555e-05,
"loss": 0.4028,
"num_tokens": 315987690.0,
"step": 499
},
{
"epoch": 0.05912262031453234,
"grad_norm": 191.65394592285156,
"learning_rate": 2.648820924844631e-05,
"loss": 4.9729,
"num_tokens": 316588692.0,
"step": 500
},
{
"epoch": 0.05924086555516141,
"grad_norm": 0.27919653058052063,
"learning_rate": 2.6387745662731268e-05,
"loss": 0.3813,
"num_tokens": 317208507.0,
"step": 501
},
{
"epoch": 0.05935911079579047,
"grad_norm": 0.23787546157836914,
"learning_rate": 2.6287379369873878e-05,
"loss": 0.4319,
"num_tokens": 317844277.0,
"step": 502
},
{
"epoch": 0.059477356036419536,
"grad_norm": 0.24857866764068604,
"learning_rate": 2.6187111846666015e-05,
"loss": 0.4168,
"num_tokens": 318478032.0,
"step": 503
},
{
"epoch": 0.0595956012770486,
"grad_norm": 0.2471940517425537,
"learning_rate": 2.6086944568446233e-05,
"loss": 0.4461,
"num_tokens": 319114663.0,
"step": 504
},
{
"epoch": 0.059713846517677666,
"grad_norm": 0.23387108743190765,
"learning_rate": 2.5986879009078095e-05,
"loss": 0.3444,
"num_tokens": 319744940.0,
"step": 505
},
{
"epoch": 0.05983209175830673,
"grad_norm": 0.2251531183719635,
"learning_rate": 2.5886916640928474e-05,
"loss": 0.3914,
"num_tokens": 320377220.0,
"step": 506
},
{
"epoch": 0.059950336998935795,
"grad_norm": 0.22722141444683075,
"learning_rate": 2.57870589348459e-05,
"loss": 0.3942,
"num_tokens": 321010441.0,
"step": 507
},
{
"epoch": 0.060068582239564856,
"grad_norm": 0.22561167180538177,
"learning_rate": 2.568730736013887e-05,
"loss": 0.3771,
"num_tokens": 321643644.0,
"step": 508
},
{
"epoch": 0.060186827480193925,
"grad_norm": 0.22242951393127441,
"learning_rate": 2.5587663384554264e-05,
"loss": 0.3877,
"num_tokens": 322275355.0,
"step": 509
},
{
"epoch": 0.060305072720822986,
"grad_norm": 0.22118404507637024,
"learning_rate": 2.5488128474255777e-05,
"loss": 0.4112,
"num_tokens": 322908591.0,
"step": 510
},
{
"epoch": 0.060423317961452054,
"grad_norm": 0.2330280840396881,
"learning_rate": 2.5388704093802296e-05,
"loss": 0.4106,
"num_tokens": 323542459.0,
"step": 511
},
{
"epoch": 0.060541563202081115,
"grad_norm": 0.2232893407344818,
"learning_rate": 2.5289391706126375e-05,
"loss": 0.3905,
"num_tokens": 324176254.0,
"step": 512
},
{
"epoch": 0.060659808442710184,
"grad_norm": 0.20871341228485107,
"learning_rate": 2.5190192772512675e-05,
"loss": 0.3664,
"num_tokens": 324803884.0,
"step": 513
},
{
"epoch": 0.060778053683339245,
"grad_norm": 0.2513749599456787,
"learning_rate": 2.509110875257654e-05,
"loss": 0.4212,
"num_tokens": 325442394.0,
"step": 514
},
{
"epoch": 0.06089629892396831,
"grad_norm": 0.2195710688829422,
"learning_rate": 2.4992141104242444e-05,
"loss": 0.4119,
"num_tokens": 326081004.0,
"step": 515
},
{
"epoch": 0.061014544164597374,
"grad_norm": 0.2243558168411255,
"learning_rate": 2.4893291283722552e-05,
"loss": 0.3926,
"num_tokens": 326711646.0,
"step": 516
},
{
"epoch": 0.06113278940522644,
"grad_norm": 0.2674740254878998,
"learning_rate": 2.479456074549534e-05,
"loss": 0.387,
"num_tokens": 327346340.0,
"step": 517
},
{
"epoch": 0.061251034645855504,
"grad_norm": 0.19878649711608887,
"learning_rate": 2.469595094228415e-05,
"loss": 0.3669,
"num_tokens": 327982364.0,
"step": 518
},
{
"epoch": 0.06136927988648457,
"grad_norm": 0.24535562098026276,
"learning_rate": 2.4597463325035814e-05,
"loss": 0.4298,
"num_tokens": 328613513.0,
"step": 519
},
{
"epoch": 0.06148752512711363,
"grad_norm": 0.2603405714035034,
"learning_rate": 2.4499099342899335e-05,
"loss": 0.4045,
"num_tokens": 329246797.0,
"step": 520
},
{
"epoch": 0.0616057703677427,
"grad_norm": 0.2385442852973938,
"learning_rate": 2.4400860443204524e-05,
"loss": 0.4146,
"num_tokens": 329882051.0,
"step": 521
},
{
"epoch": 0.06172401560837176,
"grad_norm": 0.2290627807378769,
"learning_rate": 2.4302748071440763e-05,
"loss": 0.3872,
"num_tokens": 330518781.0,
"step": 522
},
{
"epoch": 0.06184226084900083,
"grad_norm": 0.22756050527095795,
"learning_rate": 2.4204763671235655e-05,
"loss": 0.3926,
"num_tokens": 331148822.0,
"step": 523
},
{
"epoch": 0.06196050608962989,
"grad_norm": 0.24352355301380157,
"learning_rate": 2.4106908684333856e-05,
"loss": 0.4102,
"num_tokens": 331779530.0,
"step": 524
},
{
"epoch": 0.06207875133025896,
"grad_norm": 0.24109165370464325,
"learning_rate": 2.4009184550575824e-05,
"loss": 0.3531,
"num_tokens": 332409781.0,
"step": 525
},
{
"epoch": 0.06219699657088802,
"grad_norm": 0.2488730251789093,
"learning_rate": 2.3911592707876643e-05,
"loss": 0.4369,
"num_tokens": 333047880.0,
"step": 526
},
{
"epoch": 0.06231524181151708,
"grad_norm": 0.19400961697101593,
"learning_rate": 2.381413459220485e-05,
"loss": 0.4013,
"num_tokens": 333683586.0,
"step": 527
},
{
"epoch": 0.06243348705214615,
"grad_norm": 0.23837246000766754,
"learning_rate": 2.371681163756134e-05,
"loss": 0.4032,
"num_tokens": 334315506.0,
"step": 528
},
{
"epoch": 0.06255173229277522,
"grad_norm": 0.21308279037475586,
"learning_rate": 2.361962527595824e-05,
"loss": 0.3715,
"num_tokens": 334951291.0,
"step": 529
},
{
"epoch": 0.06266997753340428,
"grad_norm": 0.2166317254304886,
"learning_rate": 2.352257693739783e-05,
"loss": 0.3831,
"num_tokens": 335585867.0,
"step": 530
},
{
"epoch": 0.06278822277403334,
"grad_norm": 0.22203749418258667,
"learning_rate": 2.3425668049851535e-05,
"loss": 0.3727,
"num_tokens": 336220052.0,
"step": 531
},
{
"epoch": 0.0629064680146624,
"grad_norm": 0.20896011590957642,
"learning_rate": 2.3328900039238882e-05,
"loss": 0.4235,
"num_tokens": 336856340.0,
"step": 532
},
{
"epoch": 0.06302471325529148,
"grad_norm": 0.19895263016223907,
"learning_rate": 2.323227432940654e-05,
"loss": 0.3766,
"num_tokens": 337491980.0,
"step": 533
},
{
"epoch": 0.06314295849592054,
"grad_norm": 0.21203070878982544,
"learning_rate": 2.3135792342107335e-05,
"loss": 0.3798,
"num_tokens": 338130649.0,
"step": 534
},
{
"epoch": 0.0632612037365496,
"grad_norm": 0.24771364033222198,
"learning_rate": 2.3039455496979403e-05,
"loss": 0.4252,
"num_tokens": 338765865.0,
"step": 535
},
{
"epoch": 0.06337944897717866,
"grad_norm": 0.2177121639251709,
"learning_rate": 2.294326521152522e-05,
"loss": 0.3969,
"num_tokens": 339363366.0,
"step": 536
},
{
"epoch": 0.06349769421780774,
"grad_norm": 0.2345789521932602,
"learning_rate": 2.2847222901090787e-05,
"loss": 0.4415,
"num_tokens": 339999650.0,
"step": 537
},
{
"epoch": 0.0636159394584368,
"grad_norm": 0.21585899591445923,
"learning_rate": 2.2751329978844802e-05,
"loss": 0.4005,
"num_tokens": 340634297.0,
"step": 538
},
{
"epoch": 0.06373418469906586,
"grad_norm": 0.23928019404411316,
"learning_rate": 2.2655587855757862e-05,
"loss": 0.4249,
"num_tokens": 341269246.0,
"step": 539
},
{
"epoch": 0.06385242993969492,
"grad_norm": 0.2342565506696701,
"learning_rate": 2.255999794058169e-05,
"loss": 0.4108,
"num_tokens": 341900107.0,
"step": 540
},
{
"epoch": 0.063970675180324,
"grad_norm": 0.2086341232061386,
"learning_rate": 2.246456163982845e-05,
"loss": 0.4149,
"num_tokens": 342539152.0,
"step": 541
},
{
"epoch": 0.06408892042095306,
"grad_norm": 0.20828045904636383,
"learning_rate": 2.236928035774997e-05,
"loss": 0.4131,
"num_tokens": 343166271.0,
"step": 542
},
{
"epoch": 0.06420716566158212,
"grad_norm": 0.20667296648025513,
"learning_rate": 2.2274155496317174e-05,
"loss": 0.3735,
"num_tokens": 343801657.0,
"step": 543
},
{
"epoch": 0.06432541090221118,
"grad_norm": 0.20303893089294434,
"learning_rate": 2.217918845519939e-05,
"loss": 0.3877,
"num_tokens": 344436926.0,
"step": 544
},
{
"epoch": 0.06444365614284026,
"grad_norm": 0.1928926706314087,
"learning_rate": 2.208438063174377e-05,
"loss": 0.3732,
"num_tokens": 345071661.0,
"step": 545
},
{
"epoch": 0.06456190138346932,
"grad_norm": 0.24890753626823425,
"learning_rate": 2.1989733420954752e-05,
"loss": 0.4558,
"num_tokens": 345710262.0,
"step": 546
},
{
"epoch": 0.06468014662409838,
"grad_norm": 0.21143190562725067,
"learning_rate": 2.1895248215473494e-05,
"loss": 0.396,
"num_tokens": 346345760.0,
"step": 547
},
{
"epoch": 0.06479839186472744,
"grad_norm": 0.20359720289707184,
"learning_rate": 2.1800926405557425e-05,
"loss": 0.3731,
"num_tokens": 346978472.0,
"step": 548
},
{
"epoch": 0.06491663710535651,
"grad_norm": 0.22057001292705536,
"learning_rate": 2.1706769379059748e-05,
"loss": 0.3875,
"num_tokens": 347615067.0,
"step": 549
},
{
"epoch": 0.06503488234598558,
"grad_norm": 0.21384279429912567,
"learning_rate": 2.161277852140905e-05,
"loss": 0.4085,
"num_tokens": 348251545.0,
"step": 550
},
{
"epoch": 0.06515312758661464,
"grad_norm": 0.2024473398923874,
"learning_rate": 2.151895521558892e-05,
"loss": 0.3993,
"num_tokens": 348888946.0,
"step": 551
},
{
"epoch": 0.0652713728272437,
"grad_norm": 0.23349706828594208,
"learning_rate": 2.1425300842117537e-05,
"loss": 0.4371,
"num_tokens": 349519613.0,
"step": 552
},
{
"epoch": 0.06538961806787277,
"grad_norm": 0.20403353869915009,
"learning_rate": 2.133181677902747e-05,
"loss": 0.39,
"num_tokens": 350152254.0,
"step": 553
},
{
"epoch": 0.06550786330850183,
"grad_norm": 0.21594958007335663,
"learning_rate": 2.1238504401845306e-05,
"loss": 0.3878,
"num_tokens": 350786547.0,
"step": 554
},
{
"epoch": 0.0656261085491309,
"grad_norm": 0.2234022170305252,
"learning_rate": 2.1145365083571418e-05,
"loss": 0.3961,
"num_tokens": 351422383.0,
"step": 555
},
{
"epoch": 0.06574435378975996,
"grad_norm": 0.21230868995189667,
"learning_rate": 2.105240019465984e-05,
"loss": 0.4062,
"num_tokens": 352061087.0,
"step": 556
},
{
"epoch": 0.06586259903038903,
"grad_norm": 0.21539685130119324,
"learning_rate": 2.095961110299799e-05,
"loss": 0.3688,
"num_tokens": 352697788.0,
"step": 557
},
{
"epoch": 0.0659808442710181,
"grad_norm": 0.2293699085712433,
"learning_rate": 2.086699917388664e-05,
"loss": 0.3892,
"num_tokens": 353327836.0,
"step": 558
},
{
"epoch": 0.06609908951164715,
"grad_norm": 0.2052561193704605,
"learning_rate": 2.0774565770019797e-05,
"loss": 0.38,
"num_tokens": 353963351.0,
"step": 559
},
{
"epoch": 0.06621733475227622,
"grad_norm": 0.2142726480960846,
"learning_rate": 2.06823122514646e-05,
"loss": 0.4144,
"num_tokens": 354576403.0,
"step": 560
},
{
"epoch": 0.06633557999290529,
"grad_norm": 0.22821441292762756,
"learning_rate": 2.0590239975641387e-05,
"loss": 0.4167,
"num_tokens": 355211385.0,
"step": 561
},
{
"epoch": 0.06645382523353435,
"grad_norm": 0.2132827341556549,
"learning_rate": 2.0498350297303682e-05,
"loss": 0.4177,
"num_tokens": 355842242.0,
"step": 562
},
{
"epoch": 0.06657207047416341,
"grad_norm": 0.21103453636169434,
"learning_rate": 2.0406644568518244e-05,
"loss": 0.3693,
"num_tokens": 356473678.0,
"step": 563
},
{
"epoch": 0.06669031571479248,
"grad_norm": 0.20970512926578522,
"learning_rate": 2.031512413864523e-05,
"loss": 0.3613,
"num_tokens": 357112492.0,
"step": 564
},
{
"epoch": 0.06680856095542155,
"grad_norm": 0.25300124287605286,
"learning_rate": 2.0223790354318263e-05,
"loss": 0.3903,
"num_tokens": 357748442.0,
"step": 565
},
{
"epoch": 0.06692680619605061,
"grad_norm": 0.18217869102954865,
"learning_rate": 2.013264455942469e-05,
"loss": 0.3531,
"num_tokens": 358387012.0,
"step": 566
},
{
"epoch": 0.06704505143667967,
"grad_norm": 0.202724426984787,
"learning_rate": 2.0041688095085776e-05,
"loss": 0.3667,
"num_tokens": 359025550.0,
"step": 567
},
{
"epoch": 0.06716329667730873,
"grad_norm": 0.2007199376821518,
"learning_rate": 1.9950922299636945e-05,
"loss": 0.407,
"num_tokens": 359661915.0,
"step": 568
},
{
"epoch": 0.06728154191793781,
"grad_norm": 0.212602898478508,
"learning_rate": 1.986034850860815e-05,
"loss": 0.3709,
"num_tokens": 360295739.0,
"step": 569
},
{
"epoch": 0.06739978715856687,
"grad_norm": 0.20929577946662903,
"learning_rate": 1.9769968054704174e-05,
"loss": 0.4242,
"num_tokens": 360929829.0,
"step": 570
},
{
"epoch": 0.06751803239919593,
"grad_norm": 0.19647814333438873,
"learning_rate": 1.9679782267785006e-05,
"loss": 0.3632,
"num_tokens": 361568418.0,
"step": 571
},
{
"epoch": 0.067636277639825,
"grad_norm": 0.22293861210346222,
"learning_rate": 1.9589792474846353e-05,
"loss": 0.3513,
"num_tokens": 362197303.0,
"step": 572
},
{
"epoch": 0.06775452288045407,
"grad_norm": 0.24212084710597992,
"learning_rate": 1.9500000000000006e-05,
"loss": 0.393,
"num_tokens": 362825904.0,
"step": 573
},
{
"epoch": 0.06787276812108313,
"grad_norm": 0.20932357013225555,
"learning_rate": 1.9410406164454458e-05,
"loss": 0.3854,
"num_tokens": 363465140.0,
"step": 574
},
{
"epoch": 0.06799101336171219,
"grad_norm": 0.20063243806362152,
"learning_rate": 1.9321012286495403e-05,
"loss": 0.3874,
"num_tokens": 364097168.0,
"step": 575
},
{
"epoch": 0.06810925860234125,
"grad_norm": 0.1878458708524704,
"learning_rate": 1.9231819681466337e-05,
"loss": 0.3658,
"num_tokens": 364728470.0,
"step": 576
},
{
"epoch": 0.06822750384297033,
"grad_norm": 0.2246370166540146,
"learning_rate": 1.914282966174925e-05,
"loss": 0.4268,
"num_tokens": 365363497.0,
"step": 577
},
{
"epoch": 0.06834574908359939,
"grad_norm": 0.24067296087741852,
"learning_rate": 1.9054043536745268e-05,
"loss": 0.4456,
"num_tokens": 366000699.0,
"step": 578
},
{
"epoch": 0.06846399432422845,
"grad_norm": 0.18838095664978027,
"learning_rate": 1.8965462612855428e-05,
"loss": 0.3526,
"num_tokens": 366624851.0,
"step": 579
},
{
"epoch": 0.06858223956485751,
"grad_norm": 0.1913178265094757,
"learning_rate": 1.8877088193461407e-05,
"loss": 0.3845,
"num_tokens": 367261099.0,
"step": 580
},
{
"epoch": 0.06870048480548657,
"grad_norm": 0.20684710144996643,
"learning_rate": 1.878892157890638e-05,
"loss": 0.3567,
"num_tokens": 367897458.0,
"step": 581
},
{
"epoch": 0.06881873004611565,
"grad_norm": 0.21800653636455536,
"learning_rate": 1.8700964066475868e-05,
"loss": 0.4243,
"num_tokens": 368534927.0,
"step": 582
},
{
"epoch": 0.06893697528674471,
"grad_norm": 0.21104471385478973,
"learning_rate": 1.86132169503787e-05,
"loss": 0.4147,
"num_tokens": 369169358.0,
"step": 583
},
{
"epoch": 0.06905522052737377,
"grad_norm": 0.20770899951457977,
"learning_rate": 1.8525681521727856e-05,
"loss": 0.405,
"num_tokens": 369806601.0,
"step": 584
},
{
"epoch": 0.06917346576800283,
"grad_norm": 0.20592570304870605,
"learning_rate": 1.8438359068521625e-05,
"loss": 0.3933,
"num_tokens": 370442728.0,
"step": 585
},
{
"epoch": 0.0692917110086319,
"grad_norm": 0.20783546566963196,
"learning_rate": 1.83512508756245e-05,
"loss": 0.4044,
"num_tokens": 371079275.0,
"step": 586
},
{
"epoch": 0.06940995624926097,
"grad_norm": 0.20856884121894836,
"learning_rate": 1.8264358224748374e-05,
"loss": 0.3986,
"num_tokens": 371716282.0,
"step": 587
},
{
"epoch": 0.06952820148989003,
"grad_norm": 0.19124871492385864,
"learning_rate": 1.817768239443367e-05,
"loss": 0.4001,
"num_tokens": 372347661.0,
"step": 588
},
{
"epoch": 0.06964644673051909,
"grad_norm": 0.22391672432422638,
"learning_rate": 1.8091224660030457e-05,
"loss": 0.3906,
"num_tokens": 372977936.0,
"step": 589
},
{
"epoch": 0.06976469197114817,
"grad_norm": 0.22682306170463562,
"learning_rate": 1.8004986293679783e-05,
"loss": 0.4097,
"num_tokens": 373613192.0,
"step": 590
},
{
"epoch": 0.06988293721177723,
"grad_norm": 0.1943192332983017,
"learning_rate": 1.79189685642949e-05,
"loss": 0.4018,
"num_tokens": 374251716.0,
"step": 591
},
{
"epoch": 0.07000118245240629,
"grad_norm": 0.1957077533006668,
"learning_rate": 1.7833172737542572e-05,
"loss": 0.359,
"num_tokens": 374880470.0,
"step": 592
},
{
"epoch": 0.07011942769303535,
"grad_norm": 0.21087896823883057,
"learning_rate": 1.774760007582453e-05,
"loss": 0.399,
"num_tokens": 375518014.0,
"step": 593
},
{
"epoch": 0.07023767293366442,
"grad_norm": 0.20506969094276428,
"learning_rate": 1.76622518382588e-05,
"loss": 0.4005,
"num_tokens": 376156899.0,
"step": 594
},
{
"epoch": 0.07035591817429349,
"grad_norm": 0.18575182557106018,
"learning_rate": 1.7577129280661264e-05,
"loss": 0.3837,
"num_tokens": 376796416.0,
"step": 595
},
{
"epoch": 0.07047416341492255,
"grad_norm": 0.20734459161758423,
"learning_rate": 1.7492233655527138e-05,
"loss": 0.3834,
"num_tokens": 377430299.0,
"step": 596
},
{
"epoch": 0.07059240865555161,
"grad_norm": 0.18199484050273895,
"learning_rate": 1.7407566212012526e-05,
"loss": 0.3334,
"num_tokens": 378036010.0,
"step": 597
},
{
"epoch": 0.07071065389618068,
"grad_norm": 0.21089966595172882,
"learning_rate": 1.7323128195916088e-05,
"loss": 0.4233,
"num_tokens": 378662576.0,
"step": 598
},
{
"epoch": 0.07082889913680974,
"grad_norm": 0.19139453768730164,
"learning_rate": 1.723892084966068e-05,
"loss": 0.3544,
"num_tokens": 379292706.0,
"step": 599
},
{
"epoch": 0.0709471443774388,
"grad_norm": 0.20988748967647552,
"learning_rate": 1.7154945412275056e-05,
"loss": 0.4113,
"num_tokens": 379923752.0,
"step": 600
},
{
"epoch": 0.07106538961806787,
"grad_norm": 0.21000663936138153,
"learning_rate": 1.7071203119375692e-05,
"loss": 0.3831,
"num_tokens": 380556540.0,
"step": 601
},
{
"epoch": 0.07118363485869694,
"grad_norm": 0.187398761510849,
"learning_rate": 1.698769520314853e-05,
"loss": 0.3572,
"num_tokens": 381191645.0,
"step": 602
},
{
"epoch": 0.071301880099326,
"grad_norm": 0.1953067183494568,
"learning_rate": 1.6904422892330918e-05,
"loss": 0.4128,
"num_tokens": 381827763.0,
"step": 603
},
{
"epoch": 0.07142012533995507,
"grad_norm": 0.19437581300735474,
"learning_rate": 1.68213874121935e-05,
"loss": 0.379,
"num_tokens": 382466825.0,
"step": 604
},
{
"epoch": 0.07153837058058413,
"grad_norm": 0.21022436022758484,
"learning_rate": 1.6738589984522172e-05,
"loss": 0.3804,
"num_tokens": 383103907.0,
"step": 605
},
{
"epoch": 0.0716566158212132,
"grad_norm": 0.2030460089445114,
"learning_rate": 1.665603182760014e-05,
"loss": 0.4009,
"num_tokens": 383736705.0,
"step": 606
},
{
"epoch": 0.07177486106184226,
"grad_norm": 0.21273180842399597,
"learning_rate": 1.657371415618996e-05,
"loss": 0.4078,
"num_tokens": 384376310.0,
"step": 607
},
{
"epoch": 0.07189310630247132,
"grad_norm": 0.184920996427536,
"learning_rate": 1.6491638181515668e-05,
"loss": 0.3793,
"num_tokens": 385007094.0,
"step": 608
},
{
"epoch": 0.07201135154310039,
"grad_norm": 0.18787135183811188,
"learning_rate": 1.6409805111245015e-05,
"loss": 0.3604,
"num_tokens": 385646534.0,
"step": 609
},
{
"epoch": 0.07212959678372946,
"grad_norm": 0.18394650518894196,
"learning_rate": 1.632821614947159e-05,
"loss": 0.3549,
"num_tokens": 386283106.0,
"step": 610
},
{
"epoch": 0.07224784202435852,
"grad_norm": 0.18069717288017273,
"learning_rate": 1.624687249669722e-05,
"loss": 0.3509,
"num_tokens": 386916395.0,
"step": 611
},
{
"epoch": 0.07236608726498758,
"grad_norm": 0.2196332812309265,
"learning_rate": 1.6165775349814197e-05,
"loss": 0.3995,
"num_tokens": 387553614.0,
"step": 612
},
{
"epoch": 0.07248433250561664,
"grad_norm": 0.20063099265098572,
"learning_rate": 1.608492590208777e-05,
"loss": 0.3657,
"num_tokens": 388189908.0,
"step": 613
},
{
"epoch": 0.07260257774624572,
"grad_norm": 0.2032419592142105,
"learning_rate": 1.6004325343138506e-05,
"loss": 0.4057,
"num_tokens": 388827274.0,
"step": 614
},
{
"epoch": 0.07272082298687478,
"grad_norm": 0.2009783685207367,
"learning_rate": 1.5923974858924816e-05,
"loss": 0.3746,
"num_tokens": 389460786.0,
"step": 615
},
{
"epoch": 0.07283906822750384,
"grad_norm": 0.19908788800239563,
"learning_rate": 1.5843875631725528e-05,
"loss": 0.3981,
"num_tokens": 390066154.0,
"step": 616
},
{
"epoch": 0.0729573134681329,
"grad_norm": 0.1834346354007721,
"learning_rate": 1.5764028840122463e-05,
"loss": 0.3648,
"num_tokens": 390700370.0,
"step": 617
},
{
"epoch": 0.07307555870876198,
"grad_norm": 0.2006380409002304,
"learning_rate": 1.568443565898307e-05,
"loss": 0.3693,
"num_tokens": 391333815.0,
"step": 618
},
{
"epoch": 0.07319380394939104,
"grad_norm": 0.1987355351448059,
"learning_rate": 1.5605097259443196e-05,
"loss": 0.3864,
"num_tokens": 391972641.0,
"step": 619
},
{
"epoch": 0.0733120491900201,
"grad_norm": 0.20351499319076538,
"learning_rate": 1.5526014808889836e-05,
"loss": 0.4113,
"num_tokens": 392607122.0,
"step": 620
},
{
"epoch": 0.07343029443064916,
"grad_norm": 0.20003947615623474,
"learning_rate": 1.5447189470943905e-05,
"loss": 0.3607,
"num_tokens": 393234821.0,
"step": 621
},
{
"epoch": 0.07354853967127824,
"grad_norm": 0.20422472059726715,
"learning_rate": 1.536862240544321e-05,
"loss": 0.3633,
"num_tokens": 393867338.0,
"step": 622
},
{
"epoch": 0.0736667849119073,
"grad_norm": 0.18243864178657532,
"learning_rate": 1.5290314768425274e-05,
"loss": 0.3752,
"num_tokens": 394505968.0,
"step": 623
},
{
"epoch": 0.07378503015253636,
"grad_norm": 0.22229041159152985,
"learning_rate": 1.5212267712110427e-05,
"loss": 0.4205,
"num_tokens": 395143798.0,
"step": 624
},
{
"epoch": 0.07390327539316542,
"grad_norm": 0.19298569858074188,
"learning_rate": 1.5134482384884803e-05,
"loss": 0.368,
"num_tokens": 395781916.0,
"step": 625
},
{
"epoch": 0.0740215206337945,
"grad_norm": 0.20785243809223175,
"learning_rate": 1.5056959931283423e-05,
"loss": 0.4121,
"num_tokens": 396419440.0,
"step": 626
},
{
"epoch": 0.07413976587442356,
"grad_norm": 0.19097504019737244,
"learning_rate": 1.4979701491973403e-05,
"loss": 0.3539,
"num_tokens": 397053137.0,
"step": 627
},
{
"epoch": 0.07425801111505262,
"grad_norm": 0.2216179519891739,
"learning_rate": 1.490270820373715e-05,
"loss": 0.3927,
"num_tokens": 397685003.0,
"step": 628
},
{
"epoch": 0.07437625635568168,
"grad_norm": 0.21564562618732452,
"learning_rate": 1.4825981199455601e-05,
"loss": 0.4046,
"num_tokens": 398323686.0,
"step": 629
},
{
"epoch": 0.07449450159631076,
"grad_norm": 0.20918431878089905,
"learning_rate": 1.4749521608091632e-05,
"loss": 0.4025,
"num_tokens": 398958685.0,
"step": 630
},
{
"epoch": 0.07461274683693982,
"grad_norm": 0.2055424153804779,
"learning_rate": 1.4673330554673358e-05,
"loss": 0.3961,
"num_tokens": 399595823.0,
"step": 631
},
{
"epoch": 0.07473099207756888,
"grad_norm": 0.19133684039115906,
"learning_rate": 1.459740916027765e-05,
"loss": 0.3868,
"num_tokens": 400234363.0,
"step": 632
},
{
"epoch": 0.07484923731819794,
"grad_norm": 0.20725229382514954,
"learning_rate": 1.4521758542013575e-05,
"loss": 0.3999,
"num_tokens": 400860312.0,
"step": 633
},
{
"epoch": 0.074967482558827,
"grad_norm": 0.19468720257282257,
"learning_rate": 1.4446379813006028e-05,
"loss": 0.3931,
"num_tokens": 401493314.0,
"step": 634
},
{
"epoch": 0.07508572779945608,
"grad_norm": 0.21458375453948975,
"learning_rate": 1.4371274082379317e-05,
"loss": 0.4047,
"num_tokens": 402131410.0,
"step": 635
},
{
"epoch": 0.07520397304008514,
"grad_norm": 0.21077150106430054,
"learning_rate": 1.4296442455240818e-05,
"loss": 0.4181,
"num_tokens": 402767694.0,
"step": 636
},
{
"epoch": 0.0753222182807142,
"grad_norm": 0.21079093217849731,
"learning_rate": 1.4221886032664769e-05,
"loss": 0.4037,
"num_tokens": 403401170.0,
"step": 637
},
{
"epoch": 0.07544046352134326,
"grad_norm": 0.1916537582874298,
"learning_rate": 1.4147605911676037e-05,
"loss": 0.3909,
"num_tokens": 404033223.0,
"step": 638
},
{
"epoch": 0.07555870876197233,
"grad_norm": 0.17826271057128906,
"learning_rate": 1.4073603185233966e-05,
"loss": 0.3837,
"num_tokens": 404669142.0,
"step": 639
},
{
"epoch": 0.0756769540026014,
"grad_norm": 0.18769319355487823,
"learning_rate": 1.3999878942216336e-05,
"loss": 0.3976,
"num_tokens": 405305698.0,
"step": 640
},
{
"epoch": 0.07579519924323046,
"grad_norm": 0.21683697402477264,
"learning_rate": 1.3926434267403286e-05,
"loss": 0.4228,
"num_tokens": 405935366.0,
"step": 641
},
{
"epoch": 0.07591344448385952,
"grad_norm": 0.1858586221933365,
"learning_rate": 1.3853270241461407e-05,
"loss": 0.3949,
"num_tokens": 406563939.0,
"step": 642
},
{
"epoch": 0.0760316897244886,
"grad_norm": 0.1963283121585846,
"learning_rate": 1.378038794092781e-05,
"loss": 0.3806,
"num_tokens": 407201876.0,
"step": 643
},
{
"epoch": 0.07614993496511765,
"grad_norm": 0.1992059051990509,
"learning_rate": 1.3707788438194276e-05,
"loss": 0.3715,
"num_tokens": 407834876.0,
"step": 644
},
{
"epoch": 0.07626818020574672,
"grad_norm": 0.19572339951992035,
"learning_rate": 1.3635472801491516e-05,
"loss": 0.3752,
"num_tokens": 408474126.0,
"step": 645
},
{
"epoch": 0.07638642544637578,
"grad_norm": 0.185529887676239,
"learning_rate": 1.3563442094873424e-05,
"loss": 0.3354,
"num_tokens": 409110752.0,
"step": 646
},
{
"epoch": 0.07650467068700485,
"grad_norm": 0.20446783304214478,
"learning_rate": 1.349169737820141e-05,
"loss": 0.3986,
"num_tokens": 409744230.0,
"step": 647
},
{
"epoch": 0.07662291592763391,
"grad_norm": 0.20862102508544922,
"learning_rate": 1.3420239707128845e-05,
"loss": 0.3885,
"num_tokens": 410377730.0,
"step": 648
},
{
"epoch": 0.07674116116826298,
"grad_norm": 0.19482731819152832,
"learning_rate": 1.3349070133085478e-05,
"loss": 0.369,
"num_tokens": 411014041.0,
"step": 649
},
{
"epoch": 0.07685940640889204,
"grad_norm": 0.1799471527338028,
"learning_rate": 1.327818970326202e-05,
"loss": 0.377,
"num_tokens": 411653738.0,
"step": 650
},
{
"epoch": 0.07697765164952111,
"grad_norm": 0.17572778463363647,
"learning_rate": 1.3207599460594695e-05,
"loss": 0.347,
"num_tokens": 412288459.0,
"step": 651
},
{
"epoch": 0.07709589689015017,
"grad_norm": 0.18439733982086182,
"learning_rate": 1.31373004437499e-05,
"loss": 0.3861,
"num_tokens": 412924573.0,
"step": 652
},
{
"epoch": 0.07721414213077923,
"grad_norm": 0.18092259764671326,
"learning_rate": 1.3067293687108938e-05,
"loss": 0.3428,
"num_tokens": 413557882.0,
"step": 653
},
{
"epoch": 0.0773323873714083,
"grad_norm": 0.17916624248027802,
"learning_rate": 1.2997580220752791e-05,
"loss": 0.3431,
"num_tokens": 414190765.0,
"step": 654
},
{
"epoch": 0.07745063261203737,
"grad_norm": 0.18362957239151,
"learning_rate": 1.2928161070446937e-05,
"loss": 0.3517,
"num_tokens": 414824481.0,
"step": 655
},
{
"epoch": 0.07756887785266643,
"grad_norm": 0.18938778340816498,
"learning_rate": 1.2859037257626331e-05,
"loss": 0.3749,
"num_tokens": 415462470.0,
"step": 656
},
{
"epoch": 0.0776871230932955,
"grad_norm": 0.18327617645263672,
"learning_rate": 1.2790209799380269e-05,
"loss": 0.4054,
"num_tokens": 416098823.0,
"step": 657
},
{
"epoch": 0.07780536833392455,
"grad_norm": 0.18833239376544952,
"learning_rate": 1.2721679708437516e-05,
"loss": 0.3851,
"num_tokens": 416727909.0,
"step": 658
},
{
"epoch": 0.07792361357455363,
"grad_norm": 0.21469521522521973,
"learning_rate": 1.2653447993151367e-05,
"loss": 0.4095,
"num_tokens": 417362676.0,
"step": 659
},
{
"epoch": 0.07804185881518269,
"grad_norm": 0.1744416207075119,
"learning_rate": 1.2585515657484778e-05,
"loss": 0.3625,
"num_tokens": 417996258.0,
"step": 660
},
{
"epoch": 0.07816010405581175,
"grad_norm": 0.19375596940517426,
"learning_rate": 1.2517883700995673e-05,
"loss": 0.4059,
"num_tokens": 418626034.0,
"step": 661
},
{
"epoch": 0.07827834929644081,
"grad_norm": 0.19247286021709442,
"learning_rate": 1.2450553118822141e-05,
"loss": 0.4297,
"num_tokens": 419263225.0,
"step": 662
},
{
"epoch": 0.07839659453706989,
"grad_norm": 0.18751764297485352,
"learning_rate": 1.238352490166789e-05,
"loss": 0.3912,
"num_tokens": 419892898.0,
"step": 663
},
{
"epoch": 0.07851483977769895,
"grad_norm": 0.1725941002368927,
"learning_rate": 1.2316800035787598e-05,
"loss": 0.3779,
"num_tokens": 420527528.0,
"step": 664
},
{
"epoch": 0.07863308501832801,
"grad_norm": 0.17689573764801025,
"learning_rate": 1.2250379502972414e-05,
"loss": 0.3802,
"num_tokens": 421156121.0,
"step": 665
},
{
"epoch": 0.07875133025895707,
"grad_norm": 0.17760339379310608,
"learning_rate": 1.2184264280535551e-05,
"loss": 0.3315,
"num_tokens": 421790061.0,
"step": 666
},
{
"epoch": 0.07886957549958615,
"grad_norm": 0.18165592849254608,
"learning_rate": 1.2118455341297868e-05,
"loss": 0.39,
"num_tokens": 422426991.0,
"step": 667
},
{
"epoch": 0.07898782074021521,
"grad_norm": 0.19043633341789246,
"learning_rate": 1.2052953653573545e-05,
"loss": 0.3475,
"num_tokens": 423063834.0,
"step": 668
},
{
"epoch": 0.07910606598084427,
"grad_norm": 0.1863887906074524,
"learning_rate": 1.1987760181155897e-05,
"loss": 0.3814,
"num_tokens": 423703537.0,
"step": 669
},
{
"epoch": 0.07922431122147333,
"grad_norm": 0.19008512794971466,
"learning_rate": 1.1922875883303112e-05,
"loss": 0.3986,
"num_tokens": 424330180.0,
"step": 670
},
{
"epoch": 0.0793425564621024,
"grad_norm": 0.1887669861316681,
"learning_rate": 1.1858301714724201e-05,
"loss": 0.4111,
"num_tokens": 424966976.0,
"step": 671
},
{
"epoch": 0.07946080170273147,
"grad_norm": 0.20428043603897095,
"learning_rate": 1.1794038625564926e-05,
"loss": 0.3843,
"num_tokens": 425604191.0,
"step": 672
},
{
"epoch": 0.07957904694336053,
"grad_norm": 0.20065979659557343,
"learning_rate": 1.1730087561393799e-05,
"loss": 0.3345,
"num_tokens": 426240218.0,
"step": 673
},
{
"epoch": 0.07969729218398959,
"grad_norm": 0.18954698741436005,
"learning_rate": 1.1666449463188212e-05,
"loss": 0.3979,
"num_tokens": 426878525.0,
"step": 674
},
{
"epoch": 0.07981553742461867,
"grad_norm": 0.19051022827625275,
"learning_rate": 1.1603125267320565e-05,
"loss": 0.3658,
"num_tokens": 427512790.0,
"step": 675
},
{
"epoch": 0.07993378266524773,
"grad_norm": 0.21397797763347626,
"learning_rate": 1.1540115905544473e-05,
"loss": 0.4099,
"num_tokens": 428150456.0,
"step": 676
},
{
"epoch": 0.08005202790587679,
"grad_norm": 0.1928778886795044,
"learning_rate": 1.1477422304981104e-05,
"loss": 0.3455,
"num_tokens": 428783253.0,
"step": 677
},
{
"epoch": 0.08017027314650585,
"grad_norm": 0.19302059710025787,
"learning_rate": 1.1415045388105477e-05,
"loss": 0.3846,
"num_tokens": 429419007.0,
"step": 678
},
{
"epoch": 0.08028851838713492,
"grad_norm": 0.190629780292511,
"learning_rate": 1.1352986072732943e-05,
"loss": 0.3779,
"num_tokens": 430051255.0,
"step": 679
},
{
"epoch": 0.08040676362776399,
"grad_norm": 0.22666533291339874,
"learning_rate": 1.1291245272005658e-05,
"loss": 0.4233,
"num_tokens": 430683994.0,
"step": 680
},
{
"epoch": 0.08052500886839305,
"grad_norm": 0.2007281333208084,
"learning_rate": 1.1229823894379133e-05,
"loss": 0.3534,
"num_tokens": 431314161.0,
"step": 681
},
{
"epoch": 0.08064325410902211,
"grad_norm": 0.2027830183506012,
"learning_rate": 1.1168722843608897e-05,
"loss": 0.3763,
"num_tokens": 431943550.0,
"step": 682
},
{
"epoch": 0.08076149934965117,
"grad_norm": 0.18750514090061188,
"learning_rate": 1.1107943018737158e-05,
"loss": 0.3677,
"num_tokens": 432580022.0,
"step": 683
},
{
"epoch": 0.08087974459028024,
"grad_norm": 0.20712460577487946,
"learning_rate": 1.104748531407962e-05,
"loss": 0.4149,
"num_tokens": 433215860.0,
"step": 684
},
{
"epoch": 0.0809979898309093,
"grad_norm": 0.21580064296722412,
"learning_rate": 1.0987350619212307e-05,
"loss": 0.3697,
"num_tokens": 433849766.0,
"step": 685
},
{
"epoch": 0.08111623507153837,
"grad_norm": 0.16628190875053406,
"learning_rate": 1.0927539818958437e-05,
"loss": 0.348,
"num_tokens": 434484743.0,
"step": 686
},
{
"epoch": 0.08123448031216743,
"grad_norm": 0.2060742974281311,
"learning_rate": 1.0868053793375467e-05,
"loss": 0.3591,
"num_tokens": 435100372.0,
"step": 687
},
{
"epoch": 0.0813527255527965,
"grad_norm": 0.2135939598083496,
"learning_rate": 1.0808893417742116e-05,
"loss": 0.4258,
"num_tokens": 435733891.0,
"step": 688
},
{
"epoch": 0.08147097079342557,
"grad_norm": 0.1941777616739273,
"learning_rate": 1.0750059562545451e-05,
"loss": 0.3644,
"num_tokens": 436365690.0,
"step": 689
},
{
"epoch": 0.08158921603405463,
"grad_norm": 0.17885838449001312,
"learning_rate": 1.0691553093468144e-05,
"loss": 0.3508,
"num_tokens": 437003639.0,
"step": 690
},
{
"epoch": 0.08170746127468369,
"grad_norm": 0.18553341925144196,
"learning_rate": 1.0633374871375666e-05,
"loss": 0.3832,
"num_tokens": 437642920.0,
"step": 691
},
{
"epoch": 0.08182570651531276,
"grad_norm": 0.2075071483850479,
"learning_rate": 1.0575525752303687e-05,
"loss": 0.3829,
"num_tokens": 438277063.0,
"step": 692
},
{
"epoch": 0.08194395175594182,
"grad_norm": 0.21307510137557983,
"learning_rate": 1.0518006587445431e-05,
"loss": 0.3931,
"num_tokens": 438915083.0,
"step": 693
},
{
"epoch": 0.08206219699657089,
"grad_norm": 0.20583738386631012,
"learning_rate": 1.0460818223139167e-05,
"loss": 0.4053,
"num_tokens": 439554233.0,
"step": 694
},
{
"epoch": 0.08218044223719995,
"grad_norm": 0.17440171539783478,
"learning_rate": 1.0403961500855766e-05,
"loss": 0.359,
"num_tokens": 440187716.0,
"step": 695
},
{
"epoch": 0.08229868747782902,
"grad_norm": 0.1777043342590332,
"learning_rate": 1.0347437257186311e-05,
"loss": 0.3862,
"num_tokens": 440823462.0,
"step": 696
},
{
"epoch": 0.08241693271845808,
"grad_norm": 0.18520857393741608,
"learning_rate": 1.0291246323829772e-05,
"loss": 0.3751,
"num_tokens": 441461261.0,
"step": 697
},
{
"epoch": 0.08253517795908714,
"grad_norm": 0.2085760086774826,
"learning_rate": 1.0235389527580807e-05,
"loss": 0.3989,
"num_tokens": 442092406.0,
"step": 698
},
{
"epoch": 0.0826534231997162,
"grad_norm": 0.1899712234735489,
"learning_rate": 1.0179867690317546e-05,
"loss": 0.4033,
"num_tokens": 442729228.0,
"step": 699
},
{
"epoch": 0.08277166844034528,
"grad_norm": 0.19925859570503235,
"learning_rate": 1.0124681628989546e-05,
"loss": 0.416,
"num_tokens": 443368453.0,
"step": 700
},
{
"epoch": 0.08288991368097434,
"grad_norm": 0.1773071438074112,
"learning_rate": 1.006983215560575e-05,
"loss": 0.3633,
"num_tokens": 444004993.0,
"step": 701
},
{
"epoch": 0.0830081589216034,
"grad_norm": 0.20045937597751617,
"learning_rate": 1.001532007722252e-05,
"loss": 0.4294,
"num_tokens": 444641198.0,
"step": 702
},
{
"epoch": 0.08312640416223246,
"grad_norm": 0.18577006459236145,
"learning_rate": 9.9611461959318e-06,
"loss": 0.3833,
"num_tokens": 445272456.0,
"step": 703
},
{
"epoch": 0.08324464940286154,
"grad_norm": 0.21089830994606018,
"learning_rate": 9.907311308849286e-06,
"loss": 0.4268,
"num_tokens": 445909612.0,
"step": 704
},
{
"epoch": 0.0833628946434906,
"grad_norm": 0.22879935801029205,
"learning_rate": 9.853816208102698e-06,
"loss": 0.4456,
"num_tokens": 446544323.0,
"step": 705
},
{
"epoch": 0.08348113988411966,
"grad_norm": 0.1861100196838379,
"learning_rate": 9.800661680820146e-06,
"loss": 0.3963,
"num_tokens": 447177697.0,
"step": 706
},
{
"epoch": 0.08359938512474872,
"grad_norm": 0.21287429332733154,
"learning_rate": 9.747848509118531e-06,
"loss": 0.4048,
"num_tokens": 447813578.0,
"step": 707
},
{
"epoch": 0.0837176303653778,
"grad_norm": 0.19029271602630615,
"learning_rate": 9.69537747009204e-06,
"loss": 0.3696,
"num_tokens": 448448247.0,
"step": 708
},
{
"epoch": 0.08383587560600686,
"grad_norm": 0.19157418608665466,
"learning_rate": 9.643249335800701e-06,
"loss": 0.3907,
"num_tokens": 449081260.0,
"step": 709
},
{
"epoch": 0.08395412084663592,
"grad_norm": 0.1897335648536682,
"learning_rate": 9.591464873259048e-06,
"loss": 0.3519,
"num_tokens": 449718960.0,
"step": 710
},
{
"epoch": 0.08407236608726498,
"grad_norm": 0.20246022939682007,
"learning_rate": 9.540024844424825e-06,
"loss": 0.3647,
"num_tokens": 450354221.0,
"step": 711
},
{
"epoch": 0.08419061132789406,
"grad_norm": 0.22009633481502533,
"learning_rate": 9.48893000618775e-06,
"loss": 0.4162,
"num_tokens": 450990864.0,
"step": 712
},
{
"epoch": 0.08430885656852312,
"grad_norm": 0.17846493422985077,
"learning_rate": 9.438181110358414e-06,
"loss": 0.347,
"num_tokens": 451629963.0,
"step": 713
},
{
"epoch": 0.08442710180915218,
"grad_norm": 0.17807744443416595,
"learning_rate": 9.387778903657208e-06,
"loss": 0.3508,
"num_tokens": 452263375.0,
"step": 714
},
{
"epoch": 0.08454534704978124,
"grad_norm": 0.2217930108308792,
"learning_rate": 9.337724127703315e-06,
"loss": 0.4266,
"num_tokens": 452899788.0,
"step": 715
},
{
"epoch": 0.08466359229041032,
"grad_norm": 0.17611801624298096,
"learning_rate": 9.288017519003827e-06,
"loss": 0.3527,
"num_tokens": 453532800.0,
"step": 716
},
{
"epoch": 0.08478183753103938,
"grad_norm": 0.18967418372631073,
"learning_rate": 9.2386598089429e-06,
"loss": 0.4013,
"num_tokens": 454167051.0,
"step": 717
},
{
"epoch": 0.08490008277166844,
"grad_norm": 0.18361113965511322,
"learning_rate": 9.189651723770968e-06,
"loss": 0.3954,
"num_tokens": 454801891.0,
"step": 718
},
{
"epoch": 0.0850183280122975,
"grad_norm": 0.18380604684352875,
"learning_rate": 9.140993984594098e-06,
"loss": 0.3798,
"num_tokens": 455434940.0,
"step": 719
},
{
"epoch": 0.08513657325292658,
"grad_norm": 0.2047707885503769,
"learning_rate": 9.092687307363336e-06,
"loss": 0.4165,
"num_tokens": 456070522.0,
"step": 720
},
{
"epoch": 0.08525481849355564,
"grad_norm": 0.1952807605266571,
"learning_rate": 9.044732402864214e-06,
"loss": 0.4127,
"num_tokens": 456700607.0,
"step": 721
},
{
"epoch": 0.0853730637341847,
"grad_norm": 0.20445430278778076,
"learning_rate": 8.997129976706273e-06,
"loss": 0.3739,
"num_tokens": 457333591.0,
"step": 722
},
{
"epoch": 0.08549130897481376,
"grad_norm": 0.18014107644557953,
"learning_rate": 8.949880729312658e-06,
"loss": 0.3939,
"num_tokens": 457972538.0,
"step": 723
},
{
"epoch": 0.08560955421544283,
"grad_norm": 0.18680702149868011,
"learning_rate": 8.902985355909854e-06,
"loss": 0.3814,
"num_tokens": 458608333.0,
"step": 724
},
{
"epoch": 0.0857277994560719,
"grad_norm": 0.1889398694038391,
"learning_rate": 8.856444546517439e-06,
"loss": 0.3846,
"num_tokens": 459238593.0,
"step": 725
},
{
"epoch": 0.08584604469670096,
"grad_norm": 0.1750420778989792,
"learning_rate": 8.810258985937902e-06,
"loss": 0.3657,
"num_tokens": 459848240.0,
"step": 726
},
{
"epoch": 0.08596428993733002,
"grad_norm": 0.2158506065607071,
"learning_rate": 8.764429353746627e-06,
"loss": 0.4134,
"num_tokens": 460483298.0,
"step": 727
},
{
"epoch": 0.0860825351779591,
"grad_norm": 0.1906134933233261,
"learning_rate": 8.71895632428183e-06,
"loss": 0.3773,
"num_tokens": 461109500.0,
"step": 728
},
{
"epoch": 0.08620078041858815,
"grad_norm": 0.193019300699234,
"learning_rate": 8.673840566634688e-06,
"loss": 0.3787,
"num_tokens": 461746594.0,
"step": 729
},
{
"epoch": 0.08631902565921722,
"grad_norm": 0.18906846642494202,
"learning_rate": 8.629082744639463e-06,
"loss": 0.3829,
"num_tokens": 462380799.0,
"step": 730
},
{
"epoch": 0.08643727089984628,
"grad_norm": 0.18636515736579895,
"learning_rate": 8.584683516863736e-06,
"loss": 0.3875,
"num_tokens": 463016862.0,
"step": 731
},
{
"epoch": 0.08655551614047535,
"grad_norm": 0.17957797646522522,
"learning_rate": 8.540643536598749e-06,
"loss": 0.3563,
"num_tokens": 463650306.0,
"step": 732
},
{
"epoch": 0.08667376138110441,
"grad_norm": 0.181325301527977,
"learning_rate": 8.496963451849745e-06,
"loss": 0.3773,
"num_tokens": 464282371.0,
"step": 733
},
{
"epoch": 0.08679200662173348,
"grad_norm": 0.19648700952529907,
"learning_rate": 8.453643905326459e-06,
"loss": 0.3687,
"num_tokens": 464918493.0,
"step": 734
},
{
"epoch": 0.08691025186236254,
"grad_norm": 0.19785350561141968,
"learning_rate": 8.410685534433676e-06,
"loss": 0.3761,
"num_tokens": 465551682.0,
"step": 735
},
{
"epoch": 0.0870284971029916,
"grad_norm": 0.1787901520729065,
"learning_rate": 8.368088971261814e-06,
"loss": 0.3737,
"num_tokens": 466189560.0,
"step": 736
},
{
"epoch": 0.08714674234362067,
"grad_norm": 0.19740906357765198,
"learning_rate": 8.32585484257766e-06,
"loss": 0.3781,
"num_tokens": 466826010.0,
"step": 737
},
{
"epoch": 0.08726498758424973,
"grad_norm": 0.18967872858047485,
"learning_rate": 8.28398376981511e-06,
"loss": 0.3477,
"num_tokens": 467461700.0,
"step": 738
},
{
"epoch": 0.0873832328248788,
"grad_norm": 0.16891902685165405,
"learning_rate": 8.242476369066072e-06,
"loss": 0.3352,
"num_tokens": 468097256.0,
"step": 739
},
{
"epoch": 0.08750147806550786,
"grad_norm": 0.2073381245136261,
"learning_rate": 8.20133325107137e-06,
"loss": 0.4052,
"num_tokens": 468731535.0,
"step": 740
},
{
"epoch": 0.08761972330613693,
"grad_norm": 0.18397468328475952,
"learning_rate": 8.160555021211748e-06,
"loss": 0.3544,
"num_tokens": 469363357.0,
"step": 741
},
{
"epoch": 0.087737968546766,
"grad_norm": 0.19281727075576782,
"learning_rate": 8.12014227949899e-06,
"loss": 0.3782,
"num_tokens": 469996228.0,
"step": 742
},
{
"epoch": 0.08785621378739505,
"grad_norm": 0.20584794878959656,
"learning_rate": 8.080095620567093e-06,
"loss": 0.4069,
"num_tokens": 470628575.0,
"step": 743
},
{
"epoch": 0.08797445902802412,
"grad_norm": 0.18428972363471985,
"learning_rate": 8.040415633663469e-06,
"loss": 0.3892,
"num_tokens": 471265485.0,
"step": 744
},
{
"epoch": 0.08809270426865319,
"grad_norm": 0.1747061312198639,
"learning_rate": 8.001102902640344e-06,
"loss": 0.3767,
"num_tokens": 471898145.0,
"step": 745
},
{
"epoch": 0.08821094950928225,
"grad_norm": 0.18705062568187714,
"learning_rate": 7.962158005946105e-06,
"loss": 0.3754,
"num_tokens": 472533209.0,
"step": 746
},
{
"epoch": 0.08832919474991131,
"grad_norm": 0.18788328766822815,
"learning_rate": 7.923581516616837e-06,
"loss": 0.3855,
"num_tokens": 473171790.0,
"step": 747
},
{
"epoch": 0.08844743999054037,
"grad_norm": 0.18790322542190552,
"learning_rate": 7.88537400226787e-06,
"loss": 0.3487,
"num_tokens": 473806600.0,
"step": 748
},
{
"epoch": 0.08856568523116945,
"grad_norm": 0.18705305457115173,
"learning_rate": 7.847536025085408e-06,
"loss": 0.3834,
"num_tokens": 474446221.0,
"step": 749
},
{
"epoch": 0.08868393047179851,
"grad_norm": 0.1689257174730301,
"learning_rate": 7.810068141818299e-06,
"loss": 0.3533,
"num_tokens": 475080946.0,
"step": 750
},
{
"epoch": 0.08880217571242757,
"grad_norm": 0.18348811566829681,
"learning_rate": 7.772970903769814e-06,
"loss": 0.3248,
"num_tokens": 475715589.0,
"step": 751
},
{
"epoch": 0.08892042095305663,
"grad_norm": 0.194603830575943,
"learning_rate": 7.736244856789531e-06,
"loss": 0.3856,
"num_tokens": 476350099.0,
"step": 752
},
{
"epoch": 0.08903866619368571,
"grad_norm": 0.19097204506397247,
"learning_rate": 7.69989054126533e-06,
"loss": 0.3998,
"num_tokens": 476986608.0,
"step": 753
},
{
"epoch": 0.08915691143431477,
"grad_norm": 0.18063834309577942,
"learning_rate": 7.663908492115426e-06,
"loss": 0.3828,
"num_tokens": 477626286.0,
"step": 754
},
{
"epoch": 0.08927515667494383,
"grad_norm": 0.1803908348083496,
"learning_rate": 7.628299238780476e-06,
"loss": 0.3851,
"num_tokens": 478262327.0,
"step": 755
},
{
"epoch": 0.08939340191557289,
"grad_norm": 0.18068207800388336,
"learning_rate": 7.59306330521584e-06,
"loss": 0.3602,
"num_tokens": 478899878.0,
"step": 756
},
{
"epoch": 0.08951164715620197,
"grad_norm": 0.1799282282590866,
"learning_rate": 7.558201209883818e-06,
"loss": 0.3743,
"num_tokens": 479538362.0,
"step": 757
},
{
"epoch": 0.08962989239683103,
"grad_norm": 0.1710379421710968,
"learning_rate": 7.523713465746072e-06,
"loss": 0.3626,
"num_tokens": 480177217.0,
"step": 758
},
{
"epoch": 0.08974813763746009,
"grad_norm": 0.18254569172859192,
"learning_rate": 7.489600580256027e-06,
"loss": 0.3839,
"num_tokens": 480809891.0,
"step": 759
},
{
"epoch": 0.08986638287808915,
"grad_norm": 0.19266051054000854,
"learning_rate": 7.455863055351445e-06,
"loss": 0.3762,
"num_tokens": 481446104.0,
"step": 760
},
{
"epoch": 0.08998462811871823,
"grad_norm": 0.16768276691436768,
"learning_rate": 7.422501387447021e-06,
"loss": 0.3582,
"num_tokens": 482084578.0,
"step": 761
},
{
"epoch": 0.09010287335934729,
"grad_norm": 0.18206870555877686,
"learning_rate": 7.389516067427073e-06,
"loss": 0.3688,
"num_tokens": 482713767.0,
"step": 762
},
{
"epoch": 0.09022111859997635,
"grad_norm": 0.21701638400554657,
"learning_rate": 7.356907580638336e-06,
"loss": 0.436,
"num_tokens": 483353280.0,
"step": 763
},
{
"epoch": 0.09033936384060541,
"grad_norm": 0.15846391022205353,
"learning_rate": 7.324676406882817e-06,
"loss": 0.3657,
"num_tokens": 483985107.0,
"step": 764
},
{
"epoch": 0.09045760908123449,
"grad_norm": 0.21575696766376495,
"learning_rate": 7.2928230204107194e-06,
"loss": 0.3862,
"num_tokens": 484615672.0,
"step": 765
},
{
"epoch": 0.09057585432186355,
"grad_norm": 0.19652079045772552,
"learning_rate": 7.261347889913485e-06,
"loss": 0.3826,
"num_tokens": 485253394.0,
"step": 766
},
{
"epoch": 0.09069409956249261,
"grad_norm": 0.1919373869895935,
"learning_rate": 7.230251478516881e-06,
"loss": 0.3903,
"num_tokens": 485886884.0,
"step": 767
},
{
"epoch": 0.09081234480312167,
"grad_norm": 0.21124163269996643,
"learning_rate": 7.199534243774199e-06,
"loss": 0.3766,
"num_tokens": 486516495.0,
"step": 768
},
{
"epoch": 0.09093059004375074,
"grad_norm": 0.18964266777038574,
"learning_rate": 7.169196637659522e-06,
"loss": 0.4244,
"num_tokens": 487151670.0,
"step": 769
},
{
"epoch": 0.0910488352843798,
"grad_norm": 0.20490339398384094,
"learning_rate": 7.139239106561053e-06,
"loss": 0.3828,
"num_tokens": 487786678.0,
"step": 770
},
{
"epoch": 0.09116708052500887,
"grad_norm": 0.20041170716285706,
"learning_rate": 7.109662091274574e-06,
"loss": 0.3998,
"num_tokens": 488423430.0,
"step": 771
},
{
"epoch": 0.09128532576563793,
"grad_norm": 0.17842328548431396,
"learning_rate": 7.080466026996954e-06,
"loss": 0.3712,
"num_tokens": 489055057.0,
"step": 772
},
{
"epoch": 0.091403571006267,
"grad_norm": 0.18228091299533844,
"learning_rate": 7.051651343319723e-06,
"loss": 0.3632,
"num_tokens": 489690318.0,
"step": 773
},
{
"epoch": 0.09152181624689606,
"grad_norm": 0.19914202392101288,
"learning_rate": 7.023218464222788e-06,
"loss": 0.4109,
"num_tokens": 490315503.0,
"step": 774
},
{
"epoch": 0.09164006148752513,
"grad_norm": 0.1682100147008896,
"learning_rate": 6.995167808068159e-06,
"loss": 0.3356,
"num_tokens": 490951658.0,
"step": 775
},
{
"epoch": 0.09175830672815419,
"grad_norm": 0.18745863437652588,
"learning_rate": 6.9674997875938175e-06,
"loss": 0.3389,
"num_tokens": 491582936.0,
"step": 776
},
{
"epoch": 0.09187655196878326,
"grad_norm": 0.18999552726745605,
"learning_rate": 6.940214809907637e-06,
"loss": 0.4062,
"num_tokens": 492221808.0,
"step": 777
},
{
"epoch": 0.09199479720941232,
"grad_norm": 0.20237648487091064,
"learning_rate": 6.913313276481378e-06,
"loss": 0.3851,
"num_tokens": 492851928.0,
"step": 778
},
{
"epoch": 0.09211304245004139,
"grad_norm": 0.18820145726203918,
"learning_rate": 6.886795583144813e-06,
"loss": 0.3408,
"num_tokens": 493484521.0,
"step": 779
},
{
"epoch": 0.09223128769067045,
"grad_norm": 0.2060716450214386,
"learning_rate": 6.860662120079868e-06,
"loss": 0.4156,
"num_tokens": 494120278.0,
"step": 780
},
{
"epoch": 0.09234953293129952,
"grad_norm": 0.1769654005765915,
"learning_rate": 6.834913271814898e-06,
"loss": 0.4094,
"num_tokens": 494748375.0,
"step": 781
},
{
"epoch": 0.09246777817192858,
"grad_norm": 0.2025490701198578,
"learning_rate": 6.809549417219036e-06,
"loss": 0.3979,
"num_tokens": 495383913.0,
"step": 782
},
{
"epoch": 0.09258602341255764,
"grad_norm": 0.1910087913274765,
"learning_rate": 6.784570929496596e-06,
"loss": 0.3656,
"num_tokens": 496008813.0,
"step": 783
},
{
"epoch": 0.0927042686531867,
"grad_norm": 0.18994437158107758,
"learning_rate": 6.759978176181609e-06,
"loss": 0.3939,
"num_tokens": 496639648.0,
"step": 784
},
{
"epoch": 0.09282251389381578,
"grad_norm": 0.1858188956975937,
"learning_rate": 6.7357715191323985e-06,
"loss": 0.3416,
"num_tokens": 497274171.0,
"step": 785
},
{
"epoch": 0.09294075913444484,
"grad_norm": 0.17720621824264526,
"learning_rate": 6.711951314526245e-06,
"loss": 0.3714,
"num_tokens": 497913138.0,
"step": 786
},
{
"epoch": 0.0930590043750739,
"grad_norm": 0.18589583039283752,
"learning_rate": 6.688517912854183e-06,
"loss": 0.4066,
"num_tokens": 498551639.0,
"step": 787
},
{
"epoch": 0.09317724961570296,
"grad_norm": 0.19449126720428467,
"learning_rate": 6.665471658915793e-06,
"loss": 0.3974,
"num_tokens": 499182979.0,
"step": 788
},
{
"epoch": 0.09329549485633203,
"grad_norm": 0.20465314388275146,
"learning_rate": 6.642812891814178e-06,
"loss": 0.3752,
"num_tokens": 499817574.0,
"step": 789
},
{
"epoch": 0.0934137400969611,
"grad_norm": 0.20144398510456085,
"learning_rate": 6.620541944950941e-06,
"loss": 0.4221,
"num_tokens": 500450987.0,
"step": 790
},
{
"epoch": 0.09353198533759016,
"grad_norm": 0.20738765597343445,
"learning_rate": 6.598659146021286e-06,
"loss": 0.4083,
"num_tokens": 501086255.0,
"step": 791
},
{
"epoch": 0.09365023057821922,
"grad_norm": 0.18912115693092346,
"learning_rate": 6.577164817009207e-06,
"loss": 0.375,
"num_tokens": 501724060.0,
"step": 792
},
{
"epoch": 0.09376847581884828,
"grad_norm": 0.17531508207321167,
"learning_rate": 6.556059274182744e-06,
"loss": 0.3698,
"num_tokens": 502336426.0,
"step": 793
},
{
"epoch": 0.09388672105947736,
"grad_norm": 0.2189079225063324,
"learning_rate": 6.535342828089317e-06,
"loss": 0.4016,
"num_tokens": 502970977.0,
"step": 794
},
{
"epoch": 0.09400496630010642,
"grad_norm": 0.20233234763145447,
"learning_rate": 6.515015783551183e-06,
"loss": 0.332,
"num_tokens": 503604914.0,
"step": 795
},
{
"epoch": 0.09412321154073548,
"grad_norm": 0.19407616555690765,
"learning_rate": 6.495078439660918e-06,
"loss": 0.3673,
"num_tokens": 504241729.0,
"step": 796
},
{
"epoch": 0.09424145678136454,
"grad_norm": 0.19280032813549042,
"learning_rate": 6.475531089777052e-06,
"loss": 0.3671,
"num_tokens": 504874311.0,
"step": 797
},
{
"epoch": 0.09435970202199362,
"grad_norm": 0.1831720918416977,
"learning_rate": 6.456374021519726e-06,
"loss": 0.3887,
"num_tokens": 505507864.0,
"step": 798
},
{
"epoch": 0.09447794726262268,
"grad_norm": 0.19168098270893097,
"learning_rate": 6.4376075167664654e-06,
"loss": 0.3912,
"num_tokens": 506141369.0,
"step": 799
},
{
"epoch": 0.09459619250325174,
"grad_norm": 0.18384189903736115,
"learning_rate": 6.419231851648044e-06,
"loss": 0.3676,
"num_tokens": 506777001.0,
"step": 800
},
{
"epoch": 0.0947144377438808,
"grad_norm": 0.16950486600399017,
"learning_rate": 6.401247296544408e-06,
"loss": 0.3298,
"num_tokens": 507406711.0,
"step": 801
},
{
"epoch": 0.09483268298450988,
"grad_norm": 0.1815839558839798,
"learning_rate": 6.383654116080699e-06,
"loss": 0.3838,
"num_tokens": 508042267.0,
"step": 802
},
{
"epoch": 0.09495092822513894,
"grad_norm": 0.18775033950805664,
"learning_rate": 6.366452569123366e-06,
"loss": 0.3549,
"num_tokens": 508675609.0,
"step": 803
},
{
"epoch": 0.095069173465768,
"grad_norm": 0.17260177433490753,
"learning_rate": 6.3496429087763535e-06,
"loss": 0.3564,
"num_tokens": 509312065.0,
"step": 804
},
{
"epoch": 0.09518741870639706,
"grad_norm": 0.1801680028438568,
"learning_rate": 6.333225382377383e-06,
"loss": 0.3679,
"num_tokens": 509946717.0,
"step": 805
},
{
"epoch": 0.09530566394702614,
"grad_norm": 0.1752161681652069,
"learning_rate": 6.3172002314943e-06,
"loss": 0.3705,
"num_tokens": 510583518.0,
"step": 806
},
{
"epoch": 0.0954239091876552,
"grad_norm": 0.19134798645973206,
"learning_rate": 6.30156769192153e-06,
"loss": 0.3984,
"num_tokens": 511221107.0,
"step": 807
},
{
"epoch": 0.09554215442828426,
"grad_norm": 0.1874755322933197,
"learning_rate": 6.286327993676615e-06,
"loss": 0.3846,
"num_tokens": 511860697.0,
"step": 808
},
{
"epoch": 0.09566039966891332,
"grad_norm": 0.17865346372127533,
"learning_rate": 6.271481360996808e-06,
"loss": 0.3737,
"num_tokens": 512498128.0,
"step": 809
},
{
"epoch": 0.0957786449095424,
"grad_norm": 0.19846026599407196,
"learning_rate": 6.257028012335795e-06,
"loss": 0.4089,
"num_tokens": 513128610.0,
"step": 810
},
{
"epoch": 0.09589689015017146,
"grad_norm": 0.16993194818496704,
"learning_rate": 6.2429681603604726e-06,
"loss": 0.3392,
"num_tokens": 513765105.0,
"step": 811
},
{
"epoch": 0.09601513539080052,
"grad_norm": 0.17878930270671844,
"learning_rate": 6.229302011947814e-06,
"loss": 0.3964,
"num_tokens": 514394034.0,
"step": 812
},
{
"epoch": 0.09613338063142958,
"grad_norm": 0.18822607398033142,
"learning_rate": 6.2160297681818316e-06,
"loss": 0.3763,
"num_tokens": 515033384.0,
"step": 813
},
{
"epoch": 0.09625162587205865,
"grad_norm": 0.19209401309490204,
"learning_rate": 6.2031516243506175e-06,
"loss": 0.3585,
"num_tokens": 515667789.0,
"step": 814
},
{
"epoch": 0.09636987111268772,
"grad_norm": 0.19187025725841522,
"learning_rate": 6.190667769943463e-06,
"loss": 0.3625,
"num_tokens": 516301878.0,
"step": 815
},
{
"epoch": 0.09648811635331678,
"grad_norm": 0.17314016819000244,
"learning_rate": 6.178578388648084e-06,
"loss": 0.3548,
"num_tokens": 516936923.0,
"step": 816
},
{
"epoch": 0.09660636159394584,
"grad_norm": 0.19279181957244873,
"learning_rate": 6.166883658347904e-06,
"loss": 0.4,
"num_tokens": 517574893.0,
"step": 817
},
{
"epoch": 0.09672460683457491,
"grad_norm": 0.1702749878168106,
"learning_rate": 6.155583751119448e-06,
"loss": 0.3694,
"num_tokens": 518213624.0,
"step": 818
},
{
"epoch": 0.09684285207520398,
"grad_norm": 0.1792595386505127,
"learning_rate": 6.1446788332298e-06,
"loss": 0.3538,
"num_tokens": 518852531.0,
"step": 819
},
{
"epoch": 0.09696109731583304,
"grad_norm": 0.18162083625793457,
"learning_rate": 6.134169065134162e-06,
"loss": 0.3896,
"num_tokens": 519492204.0,
"step": 820
},
{
"epoch": 0.0970793425564621,
"grad_norm": 0.18663813173770905,
"learning_rate": 6.124054601473502e-06,
"loss": 0.3965,
"num_tokens": 520130296.0,
"step": 821
},
{
"epoch": 0.09719758779709117,
"grad_norm": 0.1922474354505539,
"learning_rate": 6.114335591072261e-06,
"loss": 0.3621,
"num_tokens": 520765986.0,
"step": 822
},
{
"epoch": 0.09731583303772023,
"grad_norm": 0.236387699842453,
"learning_rate": 6.105012176936177e-06,
"loss": 0.4225,
"num_tokens": 521400644.0,
"step": 823
},
{
"epoch": 0.0974340782783493,
"grad_norm": 0.17774070799350739,
"learning_rate": 6.096084496250168e-06,
"loss": 0.364,
"num_tokens": 522039463.0,
"step": 824
},
{
"epoch": 0.09755232351897836,
"grad_norm": 0.18863226473331451,
"learning_rate": 6.087552680376332e-06,
"loss": 0.3668,
"num_tokens": 522671508.0,
"step": 825
},
{
"epoch": 0.09767056875960743,
"grad_norm": 0.19288307428359985,
"learning_rate": 6.079416854851993e-06,
"loss": 0.3596,
"num_tokens": 523311225.0,
"step": 826
},
{
"epoch": 0.0977888140002365,
"grad_norm": 0.1993461400270462,
"learning_rate": 6.071677139387874e-06,
"loss": 0.3414,
"num_tokens": 523949133.0,
"step": 827
},
{
"epoch": 0.09790705924086555,
"grad_norm": 0.18140719830989838,
"learning_rate": 6.064333647866317e-06,
"loss": 0.3793,
"num_tokens": 524577955.0,
"step": 828
},
{
"epoch": 0.09802530448149462,
"grad_norm": 0.18989813327789307,
"learning_rate": 6.057386488339618e-06,
"loss": 0.3784,
"num_tokens": 525211514.0,
"step": 829
},
{
"epoch": 0.09814354972212369,
"grad_norm": 0.18679462373256683,
"learning_rate": 6.050835763028446e-06,
"loss": 0.4006,
"num_tokens": 525848086.0,
"step": 830
},
{
"epoch": 0.09826179496275275,
"grad_norm": 0.17804615199565887,
"learning_rate": 6.04468156832031e-06,
"loss": 0.3619,
"num_tokens": 526481722.0,
"step": 831
},
{
"epoch": 0.09838004020338181,
"grad_norm": 0.1832081377506256,
"learning_rate": 6.038923994768173e-06,
"loss": 0.3818,
"num_tokens": 527117956.0,
"step": 832
},
{
"epoch": 0.09849828544401087,
"grad_norm": 0.20609410107135773,
"learning_rate": 6.033563127089097e-06,
"loss": 0.4023,
"num_tokens": 527750234.0,
"step": 833
},
{
"epoch": 0.09861653068463995,
"grad_norm": 0.201175257563591,
"learning_rate": 6.02859904416301e-06,
"loss": 0.3745,
"num_tokens": 528386561.0,
"step": 834
},
{
"epoch": 0.09873477592526901,
"grad_norm": 0.20368118584156036,
"learning_rate": 6.024031819031541e-06,
"loss": 0.4117,
"num_tokens": 529021750.0,
"step": 835
},
{
"epoch": 0.09885302116589807,
"grad_norm": 0.18870466947555542,
"learning_rate": 6.019861518896941e-06,
"loss": 0.3533,
"num_tokens": 529661276.0,
"step": 836
},
{
"epoch": 0.09897126640652713,
"grad_norm": 0.2020527869462967,
"learning_rate": 6.016088205121099e-06,
"loss": 0.3947,
"num_tokens": 530297609.0,
"step": 837
},
{
"epoch": 0.09908951164715621,
"grad_norm": 0.18172025680541992,
"learning_rate": 6.012711933224636e-06,
"loss": 0.3672,
"num_tokens": 530933315.0,
"step": 838
},
{
"epoch": 0.09920775688778527,
"grad_norm": 0.1858338862657547,
"learning_rate": 6.009732752886096e-06,
"loss": 0.381,
"num_tokens": 531564788.0,
"step": 839
},
{
"epoch": 0.09932600212841433,
"grad_norm": 0.18906207382678986,
"learning_rate": 6.0071507079412e-06,
"loss": 0.384,
"num_tokens": 532193430.0,
"step": 840
},
{
"epoch": 0.09944424736904339,
"grad_norm": 0.1974787414073944,
"learning_rate": 6.004965836382215e-06,
"loss": 0.3912,
"num_tokens": 532828601.0,
"step": 841
},
{
"epoch": 0.09956249260967245,
"grad_norm": 0.18472707271575928,
"learning_rate": 6.003178170357397e-06,
"loss": 0.3508,
"num_tokens": 533466099.0,
"step": 842
},
{
"epoch": 0.09968073785030153,
"grad_norm": 0.17779730260372162,
"learning_rate": 6.001787736170496e-06,
"loss": 0.3865,
"num_tokens": 534102611.0,
"step": 843
},
{
"epoch": 0.09979898309093059,
"grad_norm": 0.19053196907043457,
"learning_rate": 6.000794554280395e-06,
"loss": 0.3733,
"num_tokens": 534731488.0,
"step": 844
},
{
"epoch": 0.09991722833155965,
"grad_norm": 0.1939527839422226,
"learning_rate": 6.0001986393007945e-06,
"loss": 0.3785,
"num_tokens": 535370116.0,
"step": 845
}
],
"logging_steps": 1.0,
"max_steps": 845,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 845,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.527336137257124e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}