Sathvik0101's picture
Upload cyber-duel-tiny LoRA adapter (SFT)
136ad6a verified
Raw
History Blame Contribute Delete
68.8 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.548675110936165,
"epoch": 0.013333333333333334,
"grad_norm": 6.004289150238037,
"learning_rate": 1.688888888888889e-05,
"loss": 3.992266082763672,
"mean_token_accuracy": 0.4231670804321766,
"num_tokens": 102186.0,
"step": 20
},
{
"entropy": 2.2785673171281813,
"epoch": 0.02666666666666667,
"grad_norm": 3.0929577350616455,
"learning_rate": 3.466666666666667e-05,
"loss": 2.8258544921875,
"mean_token_accuracy": 0.5193272314965725,
"num_tokens": 204333.0,
"step": 40
},
{
"entropy": 1.6070524707436562,
"epoch": 0.04,
"grad_norm": 2.344675302505493,
"learning_rate": 5.244444444444445e-05,
"loss": 1.4720239639282227,
"mean_token_accuracy": 0.7221725225448609,
"num_tokens": 306198.0,
"step": 60
},
{
"entropy": 0.4960472501814365,
"epoch": 0.05333333333333334,
"grad_norm": 2.080559730529785,
"learning_rate": 7.022222222222222e-05,
"loss": 0.4806540012359619,
"mean_token_accuracy": 0.9012673273682594,
"num_tokens": 408035.0,
"step": 80
},
{
"entropy": 0.1692034611478448,
"epoch": 0.06666666666666667,
"grad_norm": 1.5278408527374268,
"learning_rate": 8.800000000000001e-05,
"loss": 0.1592921018600464,
"mean_token_accuracy": 0.9599622413516045,
"num_tokens": 509962.0,
"step": 100
},
{
"entropy": 0.11861470770090818,
"epoch": 0.08,
"grad_norm": 0.999698281288147,
"learning_rate": 0.00010577777777777777,
"loss": 0.11052950620651245,
"mean_token_accuracy": 0.9685859054327011,
"num_tokens": 611562.0,
"step": 120
},
{
"entropy": 0.1036016432568431,
"epoch": 0.09333333333333334,
"grad_norm": 0.9115886092185974,
"learning_rate": 0.00012355555555555557,
"loss": 0.0914052426815033,
"mean_token_accuracy": 0.9704682394862175,
"num_tokens": 713684.0,
"step": 140
},
{
"entropy": 0.09701150320470334,
"epoch": 0.10666666666666667,
"grad_norm": 0.6500758528709412,
"learning_rate": 0.00014133333333333334,
"loss": 0.08168401718139648,
"mean_token_accuracy": 0.9728480890393257,
"num_tokens": 815728.0,
"step": 160
},
{
"entropy": 0.0902867017313838,
"epoch": 0.12,
"grad_norm": 0.4816068112850189,
"learning_rate": 0.00015911111111111112,
"loss": 0.0673690140247345,
"mean_token_accuracy": 0.9743824899196625,
"num_tokens": 917588.0,
"step": 180
},
{
"entropy": 0.07180177625268698,
"epoch": 0.13333333333333333,
"grad_norm": 0.42101994156837463,
"learning_rate": 0.0001768888888888889,
"loss": 0.05838126540184021,
"mean_token_accuracy": 0.975058288872242,
"num_tokens": 1020041.0,
"step": 200
},
{
"entropy": 0.06279958104714752,
"epoch": 0.14666666666666667,
"grad_norm": 0.41531553864479065,
"learning_rate": 0.0001946666666666667,
"loss": 0.05505728721618652,
"mean_token_accuracy": 0.976372754573822,
"num_tokens": 1121933.0,
"step": 220
},
{
"entropy": 0.05904992977157235,
"epoch": 0.16,
"grad_norm": 0.5666757822036743,
"learning_rate": 0.00019999470763544457,
"loss": 0.052491378784179685,
"mean_token_accuracy": 0.9762984499335289,
"num_tokens": 1223670.0,
"step": 240
},
{
"entropy": 0.05695097530260682,
"epoch": 0.17333333333333334,
"grad_norm": 0.39107683300971985,
"learning_rate": 0.00019996878719840213,
"loss": 0.05221613645553589,
"mean_token_accuracy": 0.9769444420933724,
"num_tokens": 1325903.0,
"step": 260
},
{
"entropy": 0.05454709641635418,
"epoch": 0.18666666666666668,
"grad_norm": 0.2881831228733063,
"learning_rate": 0.00019992127221406275,
"loss": 0.05105168223381042,
"mean_token_accuracy": 0.9766697883605957,
"num_tokens": 1427883.0,
"step": 280
},
{
"entropy": 0.05568597661331296,
"epoch": 0.2,
"grad_norm": 0.2969810962677002,
"learning_rate": 0.00019985217294627577,
"loss": 0.05190561413764953,
"mean_token_accuracy": 0.9768449172377587,
"num_tokens": 1529850.0,
"step": 300
},
{
"entropy": 0.05605392120778561,
"epoch": 0.21333333333333335,
"grad_norm": 0.39327648282051086,
"learning_rate": 0.00019976150432137423,
"loss": 0.05125090479850769,
"mean_token_accuracy": 0.9767352715134621,
"num_tokens": 1631796.0,
"step": 320
},
{
"entropy": 0.05631188191473484,
"epoch": 0.22666666666666666,
"grad_norm": 0.2569703757762909,
"learning_rate": 0.00019964928592495045,
"loss": 0.05136184692382813,
"mean_token_accuracy": 0.9767047330737114,
"num_tokens": 1733431.0,
"step": 340
},
{
"entropy": 0.054749509692192076,
"epoch": 0.24,
"grad_norm": 0.2503352761268616,
"learning_rate": 0.00019951554199762526,
"loss": 0.04927194118499756,
"mean_token_accuracy": 0.9772127717733383,
"num_tokens": 1835736.0,
"step": 360
},
{
"entropy": 0.053956403583288196,
"epoch": 0.25333333333333335,
"grad_norm": 0.26568838953971863,
"learning_rate": 0.00019936030142981182,
"loss": 0.04831983149051666,
"mean_token_accuracy": 0.9772727772593498,
"num_tokens": 1937395.0,
"step": 380
},
{
"entropy": 0.05297513753175735,
"epoch": 0.26666666666666666,
"grad_norm": 0.21782436966896057,
"learning_rate": 0.00019918359775547489,
"loss": 0.048703563213348386,
"mean_token_accuracy": 0.9776117220520973,
"num_tokens": 2039661.0,
"step": 400
},
{
"entropy": 0.05235615810379386,
"epoch": 0.28,
"grad_norm": 0.2456953078508377,
"learning_rate": 0.00019898546914488697,
"loss": 0.04742903709411621,
"mean_token_accuracy": 0.9779680415987968,
"num_tokens": 2141312.0,
"step": 420
},
{
"entropy": 0.05012538954615593,
"epoch": 0.29333333333333333,
"grad_norm": 0.17193332314491272,
"learning_rate": 0.00019876595839638314,
"loss": 0.04511936604976654,
"mean_token_accuracy": 0.978802102804184,
"num_tokens": 2243220.0,
"step": 440
},
{
"entropy": 0.050425101164728404,
"epoch": 0.30666666666666664,
"grad_norm": 0.19117344915866852,
"learning_rate": 0.00019852511292711608,
"loss": 0.04454375207424164,
"mean_token_accuracy": 0.9793910697102547,
"num_tokens": 2345110.0,
"step": 460
},
{
"entropy": 0.0502777012065053,
"epoch": 0.32,
"grad_norm": 0.1484805941581726,
"learning_rate": 0.0001982629847628132,
"loss": 0.045093965530395505,
"mean_token_accuracy": 0.9782336875796318,
"num_tokens": 2446814.0,
"step": 480
},
{
"entropy": 0.04916129466146231,
"epoch": 0.3333333333333333,
"grad_norm": 0.17659035325050354,
"learning_rate": 0.0001979796305265386,
"loss": 0.04536721706390381,
"mean_token_accuracy": 0.9788262486457825,
"num_tokens": 2548699.0,
"step": 500
},
{
"entropy": 0.04801498837769032,
"epoch": 0.3466666666666667,
"grad_norm": 0.18467392027378082,
"learning_rate": 0.0001976751114264616,
"loss": 0.04428495168685913,
"mean_token_accuracy": 0.9791656643152237,
"num_tokens": 2650925.0,
"step": 520
},
{
"entropy": 0.04973381711170077,
"epoch": 0.36,
"grad_norm": 0.22871969640254974,
"learning_rate": 0.0001973494932426351,
"loss": 0.04659122526645661,
"mean_token_accuracy": 0.9777900949120522,
"num_tokens": 2753152.0,
"step": 540
},
{
"entropy": 0.050069388933479786,
"epoch": 0.37333333333333335,
"grad_norm": 0.14215655624866486,
"learning_rate": 0.00019700284631278623,
"loss": 0.04543479979038238,
"mean_token_accuracy": 0.9784642964601517,
"num_tokens": 2855157.0,
"step": 560
},
{
"entropy": 0.048892225697636606,
"epoch": 0.38666666666666666,
"grad_norm": 0.14485321938991547,
"learning_rate": 0.00019663524551712236,
"loss": 0.043998023867607115,
"mean_token_accuracy": 0.9789358124136924,
"num_tokens": 2957430.0,
"step": 580
},
{
"entropy": 0.049546369817107916,
"epoch": 0.4,
"grad_norm": 0.1522541642189026,
"learning_rate": 0.0001962467702621562,
"loss": 0.04526585042476654,
"mean_token_accuracy": 0.9789461970329285,
"num_tokens": 3059857.0,
"step": 600
},
{
"entropy": 0.048749705869704486,
"epoch": 0.41333333333333333,
"grad_norm": 0.14776450395584106,
"learning_rate": 0.00019583750446355286,
"loss": 0.04488187730312347,
"mean_token_accuracy": 0.9790951684117317,
"num_tokens": 3161377.0,
"step": 620
},
{
"entropy": 0.04819442732259631,
"epoch": 0.4266666666666667,
"grad_norm": 0.155587837100029,
"learning_rate": 0.000195407536528003,
"loss": 0.04454294443130493,
"mean_token_accuracy": 0.9792696803808212,
"num_tokens": 3263597.0,
"step": 640
},
{
"entropy": 0.048739112261682746,
"epoch": 0.44,
"grad_norm": 0.24131548404693604,
"learning_rate": 0.0001949569593341258,
"loss": 0.04449517726898193,
"mean_token_accuracy": 0.9789462149143219,
"num_tokens": 3365773.0,
"step": 660
},
{
"entropy": 0.04729501772671938,
"epoch": 0.4533333333333333,
"grad_norm": 0.16851578652858734,
"learning_rate": 0.00019448587021240611,
"loss": 0.0436316579580307,
"mean_token_accuracy": 0.9790461182594299,
"num_tokens": 3467719.0,
"step": 680
},
{
"entropy": 0.048864346370100974,
"epoch": 0.4666666666666667,
"grad_norm": 0.17274609208106995,
"learning_rate": 0.00019399437092416967,
"loss": 0.04535620212554932,
"mean_token_accuracy": 0.9788791447877884,
"num_tokens": 3569559.0,
"step": 700
},
{
"entropy": 0.04898029724135995,
"epoch": 0.48,
"grad_norm": 0.13499416410923004,
"learning_rate": 0.00019348256763960145,
"loss": 0.045434945821762086,
"mean_token_accuracy": 0.9788094267249108,
"num_tokens": 3671491.0,
"step": 720
},
{
"entropy": 0.04580554729327559,
"epoch": 0.49333333333333335,
"grad_norm": 0.12506447732448578,
"learning_rate": 0.00019295057091481147,
"loss": 0.04356709420681,
"mean_token_accuracy": 0.9791021943092346,
"num_tokens": 3773051.0,
"step": 740
},
{
"entropy": 0.047521025873720646,
"epoch": 0.5066666666666667,
"grad_norm": 0.121482253074646,
"learning_rate": 0.00019239849566795323,
"loss": 0.044592976570129395,
"mean_token_accuracy": 0.9786569505929947,
"num_tokens": 3875663.0,
"step": 760
},
{
"entropy": 0.045532725658267735,
"epoch": 0.52,
"grad_norm": 0.13711974024772644,
"learning_rate": 0.00019182646115439996,
"loss": 0.042892631888389585,
"mean_token_accuracy": 0.979731023311615,
"num_tokens": 3977742.0,
"step": 780
},
{
"entropy": 0.04748789621517062,
"epoch": 0.5333333333333333,
"grad_norm": 0.126457080245018,
"learning_rate": 0.00019123459094098398,
"loss": 0.04508825838565826,
"mean_token_accuracy": 0.9783048242330551,
"num_tokens": 4079943.0,
"step": 800
},
{
"entropy": 0.045889181550592184,
"epoch": 0.5466666666666666,
"grad_norm": 0.12796172499656677,
"learning_rate": 0.00019062301287930446,
"loss": 0.04326332211494446,
"mean_token_accuracy": 0.979296863079071,
"num_tokens": 4181963.0,
"step": 820
},
{
"entropy": 0.045128315966576335,
"epoch": 0.56,
"grad_norm": 0.0813562199473381,
"learning_rate": 0.00018999185907811009,
"loss": 0.04314403533935547,
"mean_token_accuracy": 0.9794226452708245,
"num_tokens": 4283940.0,
"step": 840
},
{
"entropy": 0.04633188545703888,
"epoch": 0.5733333333333334,
"grad_norm": 0.13212576508522034,
"learning_rate": 0.00018934126587476162,
"loss": 0.04438722729682922,
"mean_token_accuracy": 0.9792284339666366,
"num_tokens": 4386033.0,
"step": 860
},
{
"entropy": 0.046954588033258915,
"epoch": 0.5866666666666667,
"grad_norm": 0.24543477594852448,
"learning_rate": 0.0001886713738057815,
"loss": 0.04496486783027649,
"mean_token_accuracy": 0.978602097928524,
"num_tokens": 4488033.0,
"step": 880
},
{
"entropy": 0.047627194225788115,
"epoch": 0.6,
"grad_norm": 0.15973004698753357,
"learning_rate": 0.000187982327576496,
"loss": 0.0447381466627121,
"mean_token_accuracy": 0.978855662047863,
"num_tokens": 4590393.0,
"step": 900
},
{
"entropy": 0.049009975790977475,
"epoch": 0.6133333333333333,
"grad_norm": 0.4588961899280548,
"learning_rate": 0.000187274276029777,
"loss": 0.04679847955703735,
"mean_token_accuracy": 0.9788309365510941,
"num_tokens": 4692314.0,
"step": 920
},
{
"entropy": 0.05283641302958131,
"epoch": 0.6266666666666667,
"grad_norm": 0.17900370061397552,
"learning_rate": 0.00018654737211389004,
"loss": 0.04886095821857452,
"mean_token_accuracy": 0.9779917612671852,
"num_tokens": 4794297.0,
"step": 940
},
{
"entropy": 0.05194324087351561,
"epoch": 0.64,
"grad_norm": 0.2685967683792114,
"learning_rate": 0.00018580177284945566,
"loss": 0.04925000071525574,
"mean_token_accuracy": 0.9787736907601357,
"num_tokens": 4896719.0,
"step": 960
},
{
"entropy": 0.04687528889626265,
"epoch": 0.6533333333333333,
"grad_norm": 0.3776164948940277,
"learning_rate": 0.0001850376392955307,
"loss": 0.04358056485652924,
"mean_token_accuracy": 0.9792398914694787,
"num_tokens": 4998801.0,
"step": 980
},
{
"entropy": 0.04969303589314222,
"epoch": 0.6666666666666666,
"grad_norm": 0.10363394021987915,
"learning_rate": 0.00018425513651481747,
"loss": 0.04642247259616852,
"mean_token_accuracy": 0.9783516511321068,
"num_tokens": 5100997.0,
"step": 1000
},
{
"entropy": 0.047921424824744464,
"epoch": 0.68,
"grad_norm": 0.1332525759935379,
"learning_rate": 0.00018345443353800839,
"loss": 0.04439827501773834,
"mean_token_accuracy": 0.9791212469339371,
"num_tokens": 5202682.0,
"step": 1020
},
{
"entropy": 0.047575213573873044,
"epoch": 0.6933333333333334,
"grad_norm": 0.08405883610248566,
"learning_rate": 0.00018263570332727275,
"loss": 0.043652302026748656,
"mean_token_accuracy": 0.9786113709211349,
"num_tokens": 5304249.0,
"step": 1040
},
{
"entropy": 0.04774442110210657,
"epoch": 0.7066666666666667,
"grad_norm": 0.09579049050807953,
"learning_rate": 0.00018179912273889501,
"loss": 0.043841779232025146,
"mean_token_accuracy": 0.9791841998696327,
"num_tokens": 5406457.0,
"step": 1060
},
{
"entropy": 0.04760089740157127,
"epoch": 0.72,
"grad_norm": 0.13812078535556793,
"learning_rate": 0.00018094487248507127,
"loss": 0.04469398260116577,
"mean_token_accuracy": 0.9787818253040313,
"num_tokens": 5508325.0,
"step": 1080
},
{
"entropy": 0.04628140116110444,
"epoch": 0.7333333333333333,
"grad_norm": 0.09030942618846893,
"learning_rate": 0.00018007313709487334,
"loss": 0.043077632784843445,
"mean_token_accuracy": 0.9798856094479561,
"num_tokens": 5609876.0,
"step": 1100
},
{
"entropy": 0.04589016325771809,
"epoch": 0.7466666666666667,
"grad_norm": 0.0854763314127922,
"learning_rate": 0.00017918410487438805,
"loss": 0.04384036958217621,
"mean_token_accuracy": 0.9791762813925743,
"num_tokens": 5712340.0,
"step": 1120
},
{
"entropy": 0.04689710335806012,
"epoch": 0.76,
"grad_norm": 0.10074414312839508,
"learning_rate": 0.00017827796786604042,
"loss": 0.04416438341140747,
"mean_token_accuracy": 0.979088181257248,
"num_tokens": 5814598.0,
"step": 1140
},
{
"entropy": 0.04654768798500299,
"epoch": 0.7733333333333333,
"grad_norm": 0.07522693276405334,
"learning_rate": 0.0001773549218071105,
"loss": 0.0432561069726944,
"mean_token_accuracy": 0.9793283045291901,
"num_tokens": 5916277.0,
"step": 1160
},
{
"entropy": 0.0449189274571836,
"epoch": 0.7866666666666666,
"grad_norm": 0.12037090212106705,
"learning_rate": 0.00017641516608745114,
"loss": 0.04267836213111877,
"mean_token_accuracy": 0.9796097055077553,
"num_tokens": 6018305.0,
"step": 1180
},
{
"entropy": 0.04518893817439675,
"epoch": 0.8,
"grad_norm": 0.15295696258544922,
"learning_rate": 0.0001754589037064175,
"loss": 0.04324706792831421,
"mean_token_accuracy": 0.9793181642889977,
"num_tokens": 6120161.0,
"step": 1200
},
{
"entropy": 0.0459614584222436,
"epoch": 0.8133333333333334,
"grad_norm": 0.10844975709915161,
"learning_rate": 0.0001744863412290165,
"loss": 0.04338730275630951,
"mean_token_accuracy": 0.9787795886397361,
"num_tokens": 6221926.0,
"step": 1220
},
{
"entropy": 0.04700327459722757,
"epoch": 0.8266666666666667,
"grad_norm": 0.12464659661054611,
"learning_rate": 0.00017349768874128603,
"loss": 0.04424178600311279,
"mean_token_accuracy": 0.9791146576404571,
"num_tokens": 6323994.0,
"step": 1240
},
{
"entropy": 0.045251396391540764,
"epoch": 0.84,
"grad_norm": 0.10585556924343109,
"learning_rate": 0.00017249315980491373,
"loss": 0.04233089089393616,
"mean_token_accuracy": 0.980115057528019,
"num_tokens": 6425801.0,
"step": 1260
},
{
"entropy": 0.04711138280108571,
"epoch": 0.8533333333333334,
"grad_norm": 0.10078904032707214,
"learning_rate": 0.0001714729714111049,
"loss": 0.043426957726478574,
"mean_token_accuracy": 0.9791831955313682,
"num_tokens": 6527510.0,
"step": 1280
},
{
"entropy": 0.04563735323026776,
"epoch": 0.8666666666666667,
"grad_norm": 0.10202273726463318,
"learning_rate": 0.00017043734393370965,
"loss": 0.043241679668426514,
"mean_token_accuracy": 0.9791531518101693,
"num_tokens": 6630052.0,
"step": 1300
},
{
"entropy": 0.04624767201021314,
"epoch": 0.88,
"grad_norm": 0.1017850786447525,
"learning_rate": 0.0001693865010816192,
"loss": 0.043641078472137454,
"mean_token_accuracy": 0.9791532784700394,
"num_tokens": 6732187.0,
"step": 1320
},
{
"entropy": 0.04555416237562895,
"epoch": 0.8933333333333333,
"grad_norm": 0.0906793549656868,
"learning_rate": 0.00016832066985044195,
"loss": 0.04301130175590515,
"mean_token_accuracy": 0.9790184095501899,
"num_tokens": 6834270.0,
"step": 1340
},
{
"entropy": 0.044891719426959756,
"epoch": 0.9066666666666666,
"grad_norm": 0.06667148321866989,
"learning_rate": 0.00016724008047346947,
"loss": 0.04192114770412445,
"mean_token_accuracy": 0.9799642145633698,
"num_tokens": 6936310.0,
"step": 1360
},
{
"entropy": 0.04586669374257326,
"epoch": 0.92,
"grad_norm": 0.12085918337106705,
"learning_rate": 0.0001661449663719432,
"loss": 0.04404585361480713,
"mean_token_accuracy": 0.9786775410175323,
"num_tokens": 7037928.0,
"step": 1380
},
{
"entropy": 0.04691507248207927,
"epoch": 0.9333333333333333,
"grad_norm": 0.09447435289621353,
"learning_rate": 0.00016503556410463234,
"loss": 0.04427667260169983,
"mean_token_accuracy": 0.9788988634943963,
"num_tokens": 7139966.0,
"step": 1400
},
{
"entropy": 0.04686050089076162,
"epoch": 0.9466666666666667,
"grad_norm": 0.07748451828956604,
"learning_rate": 0.0001639121133167342,
"loss": 0.043699628114700316,
"mean_token_accuracy": 0.9789900943636894,
"num_tokens": 7242243.0,
"step": 1420
},
{
"entropy": 0.04621442370116711,
"epoch": 0.96,
"grad_norm": 0.0875391811132431,
"learning_rate": 0.0001627748566881077,
"loss": 0.0435163140296936,
"mean_token_accuracy": 0.9793973177671432,
"num_tokens": 7344333.0,
"step": 1440
},
{
"entropy": 0.04617999196052551,
"epoch": 0.9733333333333334,
"grad_norm": 0.11651453375816345,
"learning_rate": 0.00016162403988085147,
"loss": 0.0438153475522995,
"mean_token_accuracy": 0.9788163512945175,
"num_tokens": 7446501.0,
"step": 1460
},
{
"entropy": 0.04541895473375916,
"epoch": 0.9866666666666667,
"grad_norm": 0.10714145004749298,
"learning_rate": 0.0001604599114862375,
"loss": 0.043173199892044066,
"mean_token_accuracy": 0.9791891872882843,
"num_tokens": 7548187.0,
"step": 1480
},
{
"entropy": 0.04610758051276207,
"epoch": 1.0,
"grad_norm": 0.1056915670633316,
"learning_rate": 0.0001592827229710124,
"loss": 0.04365978240966797,
"mean_token_accuracy": 0.9787515595555305,
"num_tokens": 7650185.0,
"step": 1500
},
{
"entropy": 0.04553080843761563,
"epoch": 1.0133333333333334,
"grad_norm": 0.08358001708984375,
"learning_rate": 0.00015809272862307724,
"loss": 0.04281379580497742,
"mean_token_accuracy": 0.9787902727723121,
"num_tokens": 7751822.0,
"step": 1520
},
{
"entropy": 0.04557240409776568,
"epoch": 1.0266666666666666,
"grad_norm": 0.0894247367978096,
"learning_rate": 0.00015689018549655813,
"loss": 0.043633687496185306,
"mean_token_accuracy": 0.9793074056506157,
"num_tokens": 7853924.0,
"step": 1540
},
{
"entropy": 0.04621814098209143,
"epoch": 1.04,
"grad_norm": 0.060622621327638626,
"learning_rate": 0.00015567535335627916,
"loss": 0.043806785345077516,
"mean_token_accuracy": 0.9790619671344757,
"num_tokens": 7955729.0,
"step": 1560
},
{
"entropy": 0.04529289873316884,
"epoch": 1.0533333333333332,
"grad_norm": 0.06778731197118759,
"learning_rate": 0.0001544484946216499,
"loss": 0.04349397122859955,
"mean_token_accuracy": 0.9791216805577279,
"num_tokens": 8057521.0,
"step": 1580
},
{
"entropy": 0.045565437898039816,
"epoch": 1.0666666666666667,
"grad_norm": 0.09741676598787308,
"learning_rate": 0.00015320987430997939,
"loss": 0.043324217200279236,
"mean_token_accuracy": 0.9791115581989288,
"num_tokens": 8159337.0,
"step": 1600
},
{
"entropy": 0.04597685588523746,
"epoch": 1.08,
"grad_norm": 0.09679801762104034,
"learning_rate": 0.00015195975997922892,
"loss": 0.04302051663398743,
"mean_token_accuracy": 0.9793232962489128,
"num_tokens": 8262074.0,
"step": 1620
},
{
"entropy": 0.04526777658611536,
"epoch": 1.0933333333333333,
"grad_norm": 0.10501035302877426,
"learning_rate": 0.00015069842167021635,
"loss": 0.043459060788154605,
"mean_token_accuracy": 0.9790220081806182,
"num_tokens": 8363286.0,
"step": 1640
},
{
"entropy": 0.04562727101147175,
"epoch": 1.1066666666666667,
"grad_norm": 0.07695911824703217,
"learning_rate": 0.00014942613184828335,
"loss": 0.04361176192760467,
"mean_token_accuracy": 0.978962479531765,
"num_tokens": 8464992.0,
"step": 1660
},
{
"entropy": 0.04388966728001833,
"epoch": 1.12,
"grad_norm": 0.10466761142015457,
"learning_rate": 0.00014814316534443982,
"loss": 0.04218283891677856,
"mean_token_accuracy": 0.9791669443249702,
"num_tokens": 8567083.0,
"step": 1680
},
{
"entropy": 0.04554249225184322,
"epoch": 1.1333333333333333,
"grad_norm": 0.07236190885305405,
"learning_rate": 0.0001468497992959965,
"loss": 0.043398627638816835,
"mean_token_accuracy": 0.9791699111461639,
"num_tokens": 8669135.0,
"step": 1700
},
{
"entropy": 0.043595219124108554,
"epoch": 1.1466666666666667,
"grad_norm": 0.06271807104349136,
"learning_rate": 0.00014554631308669994,
"loss": 0.042030200362205505,
"mean_token_accuracy": 0.979636350274086,
"num_tokens": 8771085.0,
"step": 1720
},
{
"entropy": 0.04456626381725073,
"epoch": 1.16,
"grad_norm": 0.11451169848442078,
"learning_rate": 0.00014423298828638195,
"loss": 0.04222625195980072,
"mean_token_accuracy": 0.9794944658875465,
"num_tokens": 8873283.0,
"step": 1740
},
{
"entropy": 0.04446439165621996,
"epoch": 1.1733333333333333,
"grad_norm": 0.1023312583565712,
"learning_rate": 0.00014291010859013688,
"loss": 0.04255003333091736,
"mean_token_accuracy": 0.979724471271038,
"num_tokens": 8975472.0,
"step": 1760
},
{
"entropy": 0.04486837210133672,
"epoch": 1.1866666666666668,
"grad_norm": 0.10332223773002625,
"learning_rate": 0.00014157795975703986,
"loss": 0.04269057214260101,
"mean_token_accuracy": 0.9796782404184341,
"num_tokens": 9078026.0,
"step": 1780
},
{
"entropy": 0.04620604543015361,
"epoch": 1.2,
"grad_norm": 0.06070537120103836,
"learning_rate": 0.00014023682954841907,
"loss": 0.044662383198738095,
"mean_token_accuracy": 0.9784179985523224,
"num_tokens": 9180444.0,
"step": 1800
},
{
"entropy": 0.04559714160859585,
"epoch": 1.2133333333333334,
"grad_norm": 0.18560439348220825,
"learning_rate": 0.00013888700766569566,
"loss": 0.04349713623523712,
"mean_token_accuracy": 0.9794085487723351,
"num_tokens": 9282562.0,
"step": 1820
},
{
"entropy": 0.0467754821293056,
"epoch": 1.2266666666666666,
"grad_norm": 0.08615751564502716,
"learning_rate": 0.00013752878568780446,
"loss": 0.04393337666988373,
"mean_token_accuracy": 0.97873145788908,
"num_tokens": 9384267.0,
"step": 1840
},
{
"entropy": 0.04674078449606896,
"epoch": 1.24,
"grad_norm": 0.1094692274928093,
"learning_rate": 0.00013616245700820922,
"loss": 0.04425840079784393,
"mean_token_accuracy": 0.9783810645341873,
"num_tokens": 9486293.0,
"step": 1860
},
{
"entropy": 0.04517263481393456,
"epoch": 1.2533333333333334,
"grad_norm": 0.0624544620513916,
"learning_rate": 0.0001347883167715258,
"loss": 0.04288272559642792,
"mean_token_accuracy": 0.9790759727358818,
"num_tokens": 9587687.0,
"step": 1880
},
{
"entropy": 0.045213503576815126,
"epoch": 1.2666666666666666,
"grad_norm": 0.1179802417755127,
"learning_rate": 0.00013340666180976712,
"loss": 0.04305934309959412,
"mean_token_accuracy": 0.9792578309774399,
"num_tokens": 9689568.0,
"step": 1900
},
{
"entropy": 0.04414475904777646,
"epoch": 1.28,
"grad_norm": 0.10094133019447327,
"learning_rate": 0.0001320177905782236,
"loss": 0.04242780804634094,
"mean_token_accuracy": 0.9795284524559975,
"num_tokens": 9791805.0,
"step": 1920
},
{
"entropy": 0.04556956263259053,
"epoch": 1.2933333333333334,
"grad_norm": 0.07614333927631378,
"learning_rate": 0.0001306220030909931,
"loss": 0.043446135520935056,
"mean_token_accuracy": 0.9790474250912666,
"num_tokens": 9893871.0,
"step": 1940
},
{
"entropy": 0.04372665649279952,
"epoch": 1.3066666666666666,
"grad_norm": 0.09622333198785782,
"learning_rate": 0.00012921960085617373,
"loss": 0.04184481799602509,
"mean_token_accuracy": 0.979928120970726,
"num_tokens": 9995743.0,
"step": 1960
},
{
"entropy": 0.04449463188648224,
"epoch": 1.32,
"grad_norm": 0.08018497377634048,
"learning_rate": 0.0001278108868107346,
"loss": 0.043444639444351195,
"mean_token_accuracy": 0.979103796184063,
"num_tokens": 10097341.0,
"step": 1980
},
{
"entropy": 0.04594048615545034,
"epoch": 1.3333333333333333,
"grad_norm": 0.08098988234996796,
"learning_rate": 0.00012639616525507717,
"loss": 0.04326811134815216,
"mean_token_accuracy": 0.9793805435299874,
"num_tokens": 10199817.0,
"step": 2000
},
{
"entropy": 0.044195070117712024,
"epoch": 1.3466666666666667,
"grad_norm": 0.07928124070167542,
"learning_rate": 0.00012497574178730266,
"loss": 0.04292008876800537,
"mean_token_accuracy": 0.979155270755291,
"num_tokens": 10301704.0,
"step": 2020
},
{
"entropy": 0.04565720958635211,
"epoch": 1.3599999999999999,
"grad_norm": 0.07645630836486816,
"learning_rate": 0.00012354992323719877,
"loss": 0.04377688765525818,
"mean_token_accuracy": 0.9790802374482155,
"num_tokens": 10404032.0,
"step": 2040
},
{
"entropy": 0.044813665375113484,
"epoch": 1.3733333333333333,
"grad_norm": 0.0589720793068409,
"learning_rate": 0.0001221190175999606,
"loss": 0.04262206256389618,
"mean_token_accuracy": 0.9795415893197059,
"num_tokens": 10505610.0,
"step": 2060
},
{
"entropy": 0.04555217456072569,
"epoch": 1.3866666666666667,
"grad_norm": 0.11566988378763199,
"learning_rate": 0.00012068333396965968,
"loss": 0.04380977749824524,
"mean_token_accuracy": 0.9788099125027656,
"num_tokens": 10606782.0,
"step": 2080
},
{
"entropy": 0.04532764628529549,
"epoch": 1.4,
"grad_norm": 0.086255744099617,
"learning_rate": 0.00011924318247247568,
"loss": 0.04329647421836853,
"mean_token_accuracy": 0.9791126802563668,
"num_tokens": 10708263.0,
"step": 2100
},
{
"entropy": 0.04514106567949057,
"epoch": 1.4133333333333333,
"grad_norm": 0.06086282059550285,
"learning_rate": 0.00011779887419970512,
"loss": 0.04245937764644623,
"mean_token_accuracy": 0.9797914355993271,
"num_tokens": 10810300.0,
"step": 2120
},
{
"entropy": 0.04454901767894626,
"epoch": 1.4266666666666667,
"grad_norm": 0.07433643192052841,
"learning_rate": 0.00011635072114056162,
"loss": 0.043132221698760985,
"mean_token_accuracy": 0.9791502475738525,
"num_tokens": 10912165.0,
"step": 2140
},
{
"entropy": 0.04529751744121313,
"epoch": 1.44,
"grad_norm": 0.13444772362709045,
"learning_rate": 0.00011489903611478229,
"loss": 0.043829315900802614,
"mean_token_accuracy": 0.9784928604960441,
"num_tokens": 11014107.0,
"step": 2160
},
{
"entropy": 0.045276003703474996,
"epoch": 1.4533333333333334,
"grad_norm": 0.06211255118250847,
"learning_rate": 0.00011344413270505457,
"loss": 0.04307844340801239,
"mean_token_accuracy": 0.9793669879436493,
"num_tokens": 11116149.0,
"step": 2180
},
{
"entropy": 0.04517210628837347,
"epoch": 1.4666666666666668,
"grad_norm": 0.07761016488075256,
"learning_rate": 0.00011198632518927832,
"loss": 0.04319383502006531,
"mean_token_accuracy": 0.9791072577238082,
"num_tokens": 11217550.0,
"step": 2200
},
{
"entropy": 0.043730517756193875,
"epoch": 1.48,
"grad_norm": 0.08502429723739624,
"learning_rate": 0.00011052592847267781,
"loss": 0.0423270434141159,
"mean_token_accuracy": 0.9796715095639229,
"num_tokens": 11319372.0,
"step": 2220
},
{
"entropy": 0.04452117690816522,
"epoch": 1.4933333333333334,
"grad_norm": 0.06671646982431412,
"learning_rate": 0.00010906325801977804,
"loss": 0.04296606779098511,
"mean_token_accuracy": 0.9795390352606773,
"num_tokens": 11421402.0,
"step": 2240
},
{
"entropy": 0.04468898214399815,
"epoch": 1.5066666666666668,
"grad_norm": 0.08121279627084732,
"learning_rate": 0.00010759862978626031,
"loss": 0.04153239727020264,
"mean_token_accuracy": 0.9799500927329063,
"num_tokens": 11523747.0,
"step": 2260
},
{
"entropy": 0.04545955043286085,
"epoch": 1.52,
"grad_norm": 0.05693936347961426,
"learning_rate": 0.00010613236015071195,
"loss": 0.04396485388278961,
"mean_token_accuracy": 0.9788213685154915,
"num_tokens": 11625877.0,
"step": 2280
},
{
"entropy": 0.046351166628301146,
"epoch": 1.5333333333333332,
"grad_norm": 0.09166613221168518,
"learning_rate": 0.00010466476584628413,
"loss": 0.043498843908309937,
"mean_token_accuracy": 0.9791526988148689,
"num_tokens": 11727555.0,
"step": 2300
},
{
"entropy": 0.045797071792185305,
"epoch": 1.5466666666666666,
"grad_norm": 0.0821656882762909,
"learning_rate": 0.00010319616389227369,
"loss": 0.043224507570266725,
"mean_token_accuracy": 0.9792197465896606,
"num_tokens": 11829191.0,
"step": 2320
},
{
"entropy": 0.0452940653078258,
"epoch": 1.56,
"grad_norm": 0.07786799967288971,
"learning_rate": 0.00010172687152564273,
"loss": 0.04384516477584839,
"mean_token_accuracy": 0.9784497052431107,
"num_tokens": 11931301.0,
"step": 2340
},
{
"entropy": 0.04483237583190203,
"epoch": 1.5733333333333333,
"grad_norm": 0.08482241630554199,
"learning_rate": 0.00010025720613249136,
"loss": 0.04273432493209839,
"mean_token_accuracy": 0.9794994488358497,
"num_tokens": 12033500.0,
"step": 2360
},
{
"entropy": 0.045613698475062844,
"epoch": 1.5866666666666667,
"grad_norm": 0.0863715335726738,
"learning_rate": 9.878748517949829e-05,
"loss": 0.04371984004974365,
"mean_token_accuracy": 0.9791261553764343,
"num_tokens": 12135440.0,
"step": 2380
},
{
"entropy": 0.04589881300926209,
"epoch": 1.6,
"grad_norm": 0.062190357595682144,
"learning_rate": 9.731802614534383e-05,
"loss": 0.04390855133533478,
"mean_token_accuracy": 0.9788092419505119,
"num_tokens": 12237789.0,
"step": 2400
},
{
"entropy": 0.04429604625329375,
"epoch": 1.6133333333333333,
"grad_norm": 0.06404758989810944,
"learning_rate": 9.584914645213045e-05,
"loss": 0.042604264616966245,
"mean_token_accuracy": 0.9796271160244941,
"num_tokens": 12339966.0,
"step": 2420
},
{
"entropy": 0.04499910678714514,
"epoch": 1.6266666666666667,
"grad_norm": 0.06570903211832047,
"learning_rate": 9.438116339681545e-05,
"loss": 0.04222431182861328,
"mean_token_accuracy": 0.9794401109218598,
"num_tokens": 12441867.0,
"step": 2440
},
{
"entropy": 0.04458219092339277,
"epoch": 1.6400000000000001,
"grad_norm": 0.06039030849933624,
"learning_rate": 9.291439408267093e-05,
"loss": 0.04276288151741028,
"mean_token_accuracy": 0.9794755399227142,
"num_tokens": 12544334.0,
"step": 2460
},
{
"entropy": 0.04523820038884878,
"epoch": 1.6533333333333333,
"grad_norm": 0.09730029851198196,
"learning_rate": 9.144915535078509e-05,
"loss": 0.043028077483177184,
"mean_token_accuracy": 0.9791945442557335,
"num_tokens": 12646733.0,
"step": 2480
},
{
"entropy": 0.04477119510993362,
"epoch": 1.6666666666666665,
"grad_norm": 0.0753539651632309,
"learning_rate": 8.998576371162073e-05,
"loss": 0.04317043125629425,
"mean_token_accuracy": 0.9792640700936317,
"num_tokens": 12748659.0,
"step": 2500
},
{
"entropy": 0.044788467884063723,
"epoch": 1.6800000000000002,
"grad_norm": 0.07562968134880066,
"learning_rate": 8.852453527664466e-05,
"loss": 0.04256285130977631,
"mean_token_accuracy": 0.979301193356514,
"num_tokens": 12850375.0,
"step": 2520
},
{
"entropy": 0.045563530456274745,
"epoch": 1.6933333333333334,
"grad_norm": 0.08481646329164505,
"learning_rate": 8.706578569004392e-05,
"loss": 0.043007442355155946,
"mean_token_accuracy": 0.9794534996151925,
"num_tokens": 12952926.0,
"step": 2540
},
{
"entropy": 0.04439763380214572,
"epoch": 1.7066666666666666,
"grad_norm": 0.07377834618091583,
"learning_rate": 8.560983006054208e-05,
"loss": 0.04233894348144531,
"mean_token_accuracy": 0.9793659463524819,
"num_tokens": 13055094.0,
"step": 2560
},
{
"entropy": 0.04448066912591457,
"epoch": 1.72,
"grad_norm": 0.06845632195472717,
"learning_rate": 8.415698289333213e-05,
"loss": 0.04230453968048096,
"mean_token_accuracy": 0.9793373107910156,
"num_tokens": 13157565.0,
"step": 2580
},
{
"entropy": 0.04516846965998411,
"epoch": 1.7333333333333334,
"grad_norm": 0.0826217532157898,
"learning_rate": 8.270755802213896e-05,
"loss": 0.043338698148727414,
"mean_token_accuracy": 0.9791581705212593,
"num_tokens": 13259373.0,
"step": 2600
},
{
"entropy": 0.045483655855059625,
"epoch": 1.7466666666666666,
"grad_norm": 0.09278784692287445,
"learning_rate": 8.126186854142752e-05,
"loss": 0.043374094367027285,
"mean_token_accuracy": 0.9789844870567321,
"num_tokens": 13361653.0,
"step": 2620
},
{
"entropy": 0.044713820703327654,
"epoch": 1.76,
"grad_norm": 0.06657784432172775,
"learning_rate": 7.982022673877022e-05,
"loss": 0.04237607717514038,
"mean_token_accuracy": 0.9793095976114273,
"num_tokens": 13463283.0,
"step": 2640
},
{
"entropy": 0.044877147488296035,
"epoch": 1.7733333333333334,
"grad_norm": 0.08266546577215195,
"learning_rate": 7.838294402738875e-05,
"loss": 0.04311709105968475,
"mean_token_accuracy": 0.9791682615876198,
"num_tokens": 13565428.0,
"step": 2660
},
{
"entropy": 0.04468537019565701,
"epoch": 1.7866666666666666,
"grad_norm": 0.07597433030605316,
"learning_rate": 7.695033087888489e-05,
"loss": 0.0424690306186676,
"mean_token_accuracy": 0.9796170979738236,
"num_tokens": 13667448.0,
"step": 2680
},
{
"entropy": 0.04455111119896173,
"epoch": 1.8,
"grad_norm": 0.06538581848144531,
"learning_rate": 7.55226967561746e-05,
"loss": 0.04193790853023529,
"mean_token_accuracy": 0.9794035986065864,
"num_tokens": 13769362.0,
"step": 2700
},
{
"entropy": 0.043454491440206765,
"epoch": 1.8133333333333335,
"grad_norm": 0.05730016157031059,
"learning_rate": 7.410035004664011e-05,
"loss": 0.04141553640365601,
"mean_token_accuracy": 0.9800622522830963,
"num_tokens": 13871782.0,
"step": 2720
},
{
"entropy": 0.044676115922629836,
"epoch": 1.8266666666666667,
"grad_norm": 0.04646085202693939,
"learning_rate": 7.268359799551416e-05,
"loss": 0.04284192621707916,
"mean_token_accuracy": 0.9793128624558449,
"num_tokens": 13973630.0,
"step": 2740
},
{
"entropy": 0.04494037302210927,
"epoch": 1.8399999999999999,
"grad_norm": 0.09230729192495346,
"learning_rate": 7.12727466395112e-05,
"loss": 0.043046200275421144,
"mean_token_accuracy": 0.9793307974934577,
"num_tokens": 14075906.0,
"step": 2760
},
{
"entropy": 0.045368336327373984,
"epoch": 1.8533333333333335,
"grad_norm": 0.04331463947892189,
"learning_rate": 6.986810074071932e-05,
"loss": 0.042864075303077696,
"mean_token_accuracy": 0.978898110985756,
"num_tokens": 14177856.0,
"step": 2780
},
{
"entropy": 0.04510376630350947,
"epoch": 1.8666666666666667,
"grad_norm": 0.09033851325511932,
"learning_rate": 6.846996372076786e-05,
"loss": 0.04259768426418305,
"mean_token_accuracy": 0.9792723521590233,
"num_tokens": 14280019.0,
"step": 2800
},
{
"entropy": 0.04520597280934453,
"epoch": 1.88,
"grad_norm": 0.04347246140241623,
"learning_rate": 6.707863759528446e-05,
"loss": 0.043121880292892455,
"mean_token_accuracy": 0.9790245160460472,
"num_tokens": 14382127.0,
"step": 2820
},
{
"entropy": 0.045137868728488684,
"epoch": 1.8933333333333333,
"grad_norm": 0.08444561064243317,
"learning_rate": 6.569442290865564e-05,
"loss": 0.042786693572998045,
"mean_token_accuracy": 0.9794920086860657,
"num_tokens": 14484156.0,
"step": 2840
},
{
"entropy": 0.0450214795768261,
"epoch": 1.9066666666666667,
"grad_norm": 0.06270349770784378,
"learning_rate": 6.431761866910549e-05,
"loss": 0.04266757369041443,
"mean_token_accuracy": 0.9790657863020897,
"num_tokens": 14586261.0,
"step": 2860
},
{
"entropy": 0.04571379153057933,
"epoch": 1.92,
"grad_norm": 0.059830646961927414,
"learning_rate": 6.294852228410585e-05,
"loss": 0.043165019154548644,
"mean_token_accuracy": 0.9789528846740723,
"num_tokens": 14688252.0,
"step": 2880
},
{
"entropy": 0.04564494509249926,
"epoch": 1.9333333333333333,
"grad_norm": 0.2881755828857422,
"learning_rate": 6.158742949613263e-05,
"loss": 0.042789730429649356,
"mean_token_accuracy": 0.9789565414190292,
"num_tokens": 14790706.0,
"step": 2900
},
{
"entropy": 0.04481498738750815,
"epoch": 1.9466666666666668,
"grad_norm": 0.0739307701587677,
"learning_rate": 6.023463431878159e-05,
"loss": 0.04184747338294983,
"mean_token_accuracy": 0.9795544907450676,
"num_tokens": 14892667.0,
"step": 2920
},
{
"entropy": 0.045400716736912726,
"epoch": 1.96,
"grad_norm": 0.0694345086812973,
"learning_rate": 5.889042897325755e-05,
"loss": 0.04274559020996094,
"mean_token_accuracy": 0.9791734784841537,
"num_tokens": 14994588.0,
"step": 2940
},
{
"entropy": 0.045871376898139714,
"epoch": 1.9733333333333334,
"grad_norm": 0.06866899877786636,
"learning_rate": 5.7555103825250914e-05,
"loss": 0.043129801750183105,
"mean_token_accuracy": 0.979410058259964,
"num_tokens": 15096814.0,
"step": 2960
},
{
"entropy": 0.04594316426664591,
"epoch": 1.9866666666666668,
"grad_norm": 0.07196313887834549,
"learning_rate": 5.622894732221482e-05,
"loss": 0.04333162605762482,
"mean_token_accuracy": 0.9789909616112709,
"num_tokens": 15198781.0,
"step": 2980
},
{
"entropy": 0.046280243806540965,
"epoch": 2.0,
"grad_norm": 0.07306694984436035,
"learning_rate": 5.491224593105695e-05,
"loss": 0.04286535978317261,
"mean_token_accuracy": 0.9792644336819649,
"num_tokens": 15300370.0,
"step": 3000
},
{
"entropy": 0.044749976880848405,
"epoch": 2.013333333333333,
"grad_norm": 0.06247550994157791,
"learning_rate": 5.360528407625873e-05,
"loss": 0.04155576527118683,
"mean_token_accuracy": 0.979676017165184,
"num_tokens": 15402333.0,
"step": 3020
},
{
"entropy": 0.045135741028934716,
"epoch": 2.026666666666667,
"grad_norm": 0.09815753251314163,
"learning_rate": 5.2308344078436344e-05,
"loss": 0.042350149154663085,
"mean_token_accuracy": 0.979559974372387,
"num_tokens": 15504158.0,
"step": 3040
},
{
"entropy": 0.045068098604679106,
"epoch": 2.04,
"grad_norm": 0.09551538527011871,
"learning_rate": 5.1021706093355414e-05,
"loss": 0.04268674254417419,
"mean_token_accuracy": 0.9792046830058098,
"num_tokens": 15605979.0,
"step": 3060
},
{
"entropy": 0.0467217774130404,
"epoch": 2.0533333333333332,
"grad_norm": 0.0750860869884491,
"learning_rate": 4.974564805141405e-05,
"loss": 0.04325474202632904,
"mean_token_accuracy": 0.9788183540105819,
"num_tokens": 15708226.0,
"step": 3080
},
{
"entropy": 0.045709628332406285,
"epoch": 2.066666666666667,
"grad_norm": 0.08207862824201584,
"learning_rate": 4.848044559760624e-05,
"loss": 0.043493375182151794,
"mean_token_accuracy": 0.9793010488152504,
"num_tokens": 15810035.0,
"step": 3100
},
{
"entropy": 0.04442885173484683,
"epoch": 2.08,
"grad_norm": 0.06018839031457901,
"learning_rate": 4.7226372031978735e-05,
"loss": 0.0418207585811615,
"mean_token_accuracy": 0.9797791764140129,
"num_tokens": 15912192.0,
"step": 3120
},
{
"entropy": 0.046121115796267986,
"epoch": 2.0933333333333333,
"grad_norm": 0.06739337742328644,
"learning_rate": 4.598369825059522e-05,
"loss": 0.04348099529743195,
"mean_token_accuracy": 0.9789452716708184,
"num_tokens": 16013752.0,
"step": 3140
},
{
"entropy": 0.04560723854228854,
"epoch": 2.1066666666666665,
"grad_norm": 0.05784814432263374,
"learning_rate": 4.475269268701868e-05,
"loss": 0.04268187880516052,
"mean_token_accuracy": 0.9791408717632294,
"num_tokens": 16115637.0,
"step": 3160
},
{
"entropy": 0.045645091123878954,
"epoch": 2.12,
"grad_norm": 0.05607442185282707,
"learning_rate": 4.353362125432674e-05,
"loss": 0.042373275756835936,
"mean_token_accuracy": 0.979694114625454,
"num_tokens": 16217990.0,
"step": 3180
},
{
"entropy": 0.04457983383908868,
"epoch": 2.1333333333333333,
"grad_norm": 0.09050878137350082,
"learning_rate": 4.232674728767082e-05,
"loss": 0.042291298508644104,
"mean_token_accuracy": 0.9795105144381523,
"num_tokens": 16319781.0,
"step": 3200
},
{
"entropy": 0.04519128203392029,
"epoch": 2.1466666666666665,
"grad_norm": 0.06114558130502701,
"learning_rate": 4.113233148739224e-05,
"loss": 0.04246037602424622,
"mean_token_accuracy": 0.9795787811279297,
"num_tokens": 16422036.0,
"step": 3220
},
{
"entropy": 0.045624539349228145,
"epoch": 2.16,
"grad_norm": 0.06515778601169586,
"learning_rate": 3.9950631862707964e-05,
"loss": 0.04316512644290924,
"mean_token_accuracy": 0.9788484647870064,
"num_tokens": 16524417.0,
"step": 3240
},
{
"entropy": 0.04569779820740223,
"epoch": 2.1733333333333333,
"grad_norm": 0.08130136877298355,
"learning_rate": 3.8781903675976775e-05,
"loss": 0.04316212832927704,
"mean_token_accuracy": 0.9789097234606743,
"num_tokens": 16626474.0,
"step": 3260
},
{
"entropy": 0.04466199018061161,
"epoch": 2.1866666666666665,
"grad_norm": 0.06522400677204132,
"learning_rate": 3.762639938755974e-05,
"loss": 0.04167875051498413,
"mean_token_accuracy": 0.979556742310524,
"num_tokens": 16728484.0,
"step": 3280
},
{
"entropy": 0.044957845285534856,
"epoch": 2.2,
"grad_norm": 0.07835223525762558,
"learning_rate": 3.648436860128525e-05,
"loss": 0.041939809918403625,
"mean_token_accuracy": 0.9797166779637336,
"num_tokens": 16830621.0,
"step": 3300
},
{
"entropy": 0.04469237914308906,
"epoch": 2.2133333333333334,
"grad_norm": 0.07076659053564072,
"learning_rate": 3.535605801053147e-05,
"loss": 0.04294973611831665,
"mean_token_accuracy": 0.9787584990262985,
"num_tokens": 16932449.0,
"step": 3320
},
{
"entropy": 0.044177047722041604,
"epoch": 2.2266666666666666,
"grad_norm": 0.0865534245967865,
"learning_rate": 3.424171134493756e-05,
"loss": 0.041136741638183594,
"mean_token_accuracy": 0.9797752141952515,
"num_tokens": 17034746.0,
"step": 3340
},
{
"entropy": 0.044158230628818275,
"epoch": 2.24,
"grad_norm": 0.09348734468221664,
"learning_rate": 3.314156931775449e-05,
"loss": 0.04184678792953491,
"mean_token_accuracy": 0.979484710097313,
"num_tokens": 17137032.0,
"step": 3360
},
{
"entropy": 0.04505048170685768,
"epoch": 2.2533333333333334,
"grad_norm": 0.04819338768720627,
"learning_rate": 3.205586957384838e-05,
"loss": 0.04278863370418549,
"mean_token_accuracy": 0.9789488822221756,
"num_tokens": 17238981.0,
"step": 3380
},
{
"entropy": 0.044143668562173846,
"epoch": 2.2666666666666666,
"grad_norm": 0.08243514597415924,
"learning_rate": 3.09848466383657e-05,
"loss": 0.04165869653224945,
"mean_token_accuracy": 0.9797174796462059,
"num_tokens": 17341204.0,
"step": 3400
},
{
"entropy": 0.04463189765810967,
"epoch": 2.2800000000000002,
"grad_norm": 0.06700066477060318,
"learning_rate": 2.9928731866073135e-05,
"loss": 0.041824132204055786,
"mean_token_accuracy": 0.9796530723571777,
"num_tokens": 17443109.0,
"step": 3420
},
{
"entropy": 0.044507946353405714,
"epoch": 2.2933333333333334,
"grad_norm": 0.059370577335357666,
"learning_rate": 2.8887753391381924e-05,
"loss": 0.04232283234596253,
"mean_token_accuracy": 0.9795172438025475,
"num_tokens": 17544670.0,
"step": 3440
},
{
"entropy": 0.04427545545622706,
"epoch": 2.3066666666666666,
"grad_norm": 0.08195611089468002,
"learning_rate": 2.7862136079067646e-05,
"loss": 0.042314866185188295,
"mean_token_accuracy": 0.9798214435577393,
"num_tokens": 17647059.0,
"step": 3460
},
{
"entropy": 0.04503831313923001,
"epoch": 2.32,
"grad_norm": 0.06154360994696617,
"learning_rate": 2.6852101475696843e-05,
"loss": 0.04239094257354736,
"mean_token_accuracy": 0.979605621099472,
"num_tokens": 17749005.0,
"step": 3480
},
{
"entropy": 0.04526049355044961,
"epoch": 2.3333333333333335,
"grad_norm": 0.07333716750144958,
"learning_rate": 2.585786776176985e-05,
"loss": 0.04255903661251068,
"mean_token_accuracy": 0.9788812786340714,
"num_tokens": 17851383.0,
"step": 3500
},
{
"entropy": 0.04530645264312625,
"epoch": 2.3466666666666667,
"grad_norm": 0.06190125271677971,
"learning_rate": 2.487964970459118e-05,
"loss": 0.042575931549072264,
"mean_token_accuracy": 0.9791432306170463,
"num_tokens": 17953577.0,
"step": 3520
},
{
"entropy": 0.04435355756431818,
"epoch": 2.36,
"grad_norm": 0.08465747535228729,
"learning_rate": 2.3917658611876904e-05,
"loss": 0.04138871431350708,
"mean_token_accuracy": 0.9799614399671555,
"num_tokens": 18055293.0,
"step": 3540
},
{
"entropy": 0.04456534581258893,
"epoch": 2.3733333333333335,
"grad_norm": 0.0772717297077179,
"learning_rate": 2.297210228610952e-05,
"loss": 0.04198825061321258,
"mean_token_accuracy": 0.9794510439038276,
"num_tokens": 18157289.0,
"step": 3560
},
{
"entropy": 0.04461102448403835,
"epoch": 2.3866666666666667,
"grad_norm": 0.08000056445598602,
"learning_rate": 2.2043184979649933e-05,
"loss": 0.041901758313179015,
"mean_token_accuracy": 0.9796808436512947,
"num_tokens": 18258778.0,
"step": 3580
},
{
"entropy": 0.04491544393822551,
"epoch": 2.4,
"grad_norm": 0.0720711350440979,
"learning_rate": 2.1131107350616187e-05,
"loss": 0.042588868737220766,
"mean_token_accuracy": 0.9793313190340995,
"num_tokens": 18360839.0,
"step": 3600
},
{
"entropy": 0.045493978820741174,
"epoch": 2.413333333333333,
"grad_norm": 0.09875239431858063,
"learning_rate": 2.0236066419538934e-05,
"loss": 0.04313438236713409,
"mean_token_accuracy": 0.9793697372078896,
"num_tokens": 18462252.0,
"step": 3620
},
{
"entropy": 0.04539180537685752,
"epoch": 2.4266666666666667,
"grad_norm": 0.04752529039978981,
"learning_rate": 1.9358255526802303e-05,
"loss": 0.041815349459648134,
"mean_token_accuracy": 0.9794102787971497,
"num_tokens": 18564453.0,
"step": 3640
},
{
"entropy": 0.044612882751971485,
"epoch": 2.44,
"grad_norm": 0.05158265680074692,
"learning_rate": 1.8497864290879953e-05,
"loss": 0.04235563278198242,
"mean_token_accuracy": 0.9792704641819,
"num_tokens": 18666497.0,
"step": 3660
},
{
"entropy": 0.045019051525741816,
"epoch": 2.453333333333333,
"grad_norm": 0.0648743286728859,
"learning_rate": 1.7655078567375028e-05,
"loss": 0.04204939901828766,
"mean_token_accuracy": 0.9794104173779488,
"num_tokens": 18768455.0,
"step": 3680
},
{
"entropy": 0.04469795366749167,
"epoch": 2.466666666666667,
"grad_norm": 0.05884250998497009,
"learning_rate": 1.683008040887285e-05,
"loss": 0.04209013283252716,
"mean_token_accuracy": 0.9796774923801422,
"num_tokens": 18870275.0,
"step": 3700
},
{
"entropy": 0.04474199656397104,
"epoch": 2.48,
"grad_norm": 0.051543645560741425,
"learning_rate": 1.6023048025615405e-05,
"loss": 0.04179444909095764,
"mean_token_accuracy": 0.9795808404684067,
"num_tokens": 18972156.0,
"step": 3720
},
{
"entropy": 0.04483764311298728,
"epoch": 2.493333333333333,
"grad_norm": 0.10630819946527481,
"learning_rate": 1.5234155747005486e-05,
"loss": 0.042180657386779785,
"mean_token_accuracy": 0.9794986173510551,
"num_tokens": 19074197.0,
"step": 3740
},
{
"entropy": 0.04558736402541399,
"epoch": 2.506666666666667,
"grad_norm": 0.08093755692243576,
"learning_rate": 1.4463573983949341e-05,
"loss": 0.04298904240131378,
"mean_token_accuracy": 0.9790481492877007,
"num_tokens": 19176367.0,
"step": 3760
},
{
"entropy": 0.04453156525269151,
"epoch": 2.52,
"grad_norm": 0.0727071687579155,
"learning_rate": 1.3711469192045723e-05,
"loss": 0.041091355681419375,
"mean_token_accuracy": 0.9804318726062775,
"num_tokens": 19278992.0,
"step": 3780
},
{
"entropy": 0.04554087147116661,
"epoch": 2.533333333333333,
"grad_norm": 0.0910055935382843,
"learning_rate": 1.297800383562926e-05,
"loss": 0.04345537126064301,
"mean_token_accuracy": 0.9786257922649384,
"num_tokens": 19380593.0,
"step": 3800
},
{
"entropy": 0.04596257032826543,
"epoch": 2.546666666666667,
"grad_norm": 0.0877053365111351,
"learning_rate": 1.2263336352676235e-05,
"loss": 0.04255788326263428,
"mean_token_accuracy": 0.9795473828911782,
"num_tokens": 19482278.0,
"step": 3820
},
{
"entropy": 0.044655687548220156,
"epoch": 2.56,
"grad_norm": 0.10276857763528824,
"learning_rate": 1.1567621120579753e-05,
"loss": 0.0418385773897171,
"mean_token_accuracy": 0.9795376226305962,
"num_tokens": 19584297.0,
"step": 3840
},
{
"entropy": 0.04575161607936025,
"epoch": 2.5733333333333333,
"grad_norm": 0.09059888869524002,
"learning_rate": 1.089100842280234e-05,
"loss": 0.042618009448051455,
"mean_token_accuracy": 0.9796013042330742,
"num_tokens": 19686257.0,
"step": 3860
},
{
"entropy": 0.04560979856178164,
"epoch": 2.586666666666667,
"grad_norm": 0.048925597220659256,
"learning_rate": 1.0233644416412791e-05,
"loss": 0.04292104840278625,
"mean_token_accuracy": 0.9794995337724686,
"num_tokens": 19788450.0,
"step": 3880
},
{
"entropy": 0.0455952113494277,
"epoch": 2.6,
"grad_norm": 0.048526402562856674,
"learning_rate": 9.595671100514214e-06,
"loss": 0.042637795209884644,
"mean_token_accuracy": 0.9797911092638969,
"num_tokens": 19890524.0,
"step": 3900
},
{
"entropy": 0.04548884928226471,
"epoch": 2.6133333333333333,
"grad_norm": 0.06042620167136192,
"learning_rate": 8.977226285570606e-06,
"loss": 0.04222815930843353,
"mean_token_accuracy": 0.9794741749763489,
"num_tokens": 19992209.0,
"step": 3920
},
{
"entropy": 0.045671455282717946,
"epoch": 2.626666666666667,
"grad_norm": 0.07702252268791199,
"learning_rate": 8.378443563637828e-06,
"loss": 0.042873308062553406,
"mean_token_accuracy": 0.9794026196002961,
"num_tokens": 20093703.0,
"step": 3940
},
{
"entropy": 0.04522231016308069,
"epoch": 2.64,
"grad_norm": 0.07133087515830994,
"learning_rate": 7.799452279506125e-06,
"loss": 0.042153152823448184,
"mean_token_accuracy": 0.9797803938388825,
"num_tokens": 20195947.0,
"step": 3960
},
{
"entropy": 0.04628952695056796,
"epoch": 2.6533333333333333,
"grad_norm": 0.06586236506700516,
"learning_rate": 7.240377502759932e-06,
"loss": 0.043617674708366395,
"mean_token_accuracy": 0.9784920737147331,
"num_tokens": 20298043.0,
"step": 3980
},
{
"entropy": 0.045405203476548195,
"epoch": 2.6666666666666665,
"grad_norm": 0.06839724630117416,
"learning_rate": 6.70134000076118e-06,
"loss": 0.04227378368377686,
"mean_token_accuracy": 0.979735977947712,
"num_tokens": 20399972.0,
"step": 4000
},
{
"entropy": 0.045020535588264465,
"epoch": 2.68,
"grad_norm": 0.07815848290920258,
"learning_rate": 6.182456212562093e-06,
"loss": 0.04192916452884674,
"mean_token_accuracy": 0.9796771243214607,
"num_tokens": 20501675.0,
"step": 4020
},
{
"entropy": 0.04609425235539675,
"epoch": 2.6933333333333334,
"grad_norm": 0.05290106683969498,
"learning_rate": 5.68383822375278e-06,
"loss": 0.042898637056350705,
"mean_token_accuracy": 0.9792009994387627,
"num_tokens": 20603651.0,
"step": 4040
},
{
"entropy": 0.0457917626015842,
"epoch": 2.7066666666666666,
"grad_norm": 0.0704483613371849,
"learning_rate": 5.205593742249326e-06,
"loss": 0.0423770546913147,
"mean_token_accuracy": 0.9790433034300804,
"num_tokens": 20705702.0,
"step": 4060
},
{
"entropy": 0.044912660401314496,
"epoch": 2.7199999999999998,
"grad_norm": 0.058434613049030304,
"learning_rate": 4.747826075027506e-06,
"loss": 0.04174522757530212,
"mean_token_accuracy": 0.9795982718467713,
"num_tokens": 20807336.0,
"step": 4080
},
{
"entropy": 0.045613402500748634,
"epoch": 2.7333333333333334,
"grad_norm": 0.08788046985864639,
"learning_rate": 4.310634105807065e-06,
"loss": 0.04344511330127716,
"mean_token_accuracy": 0.9793641656637192,
"num_tokens": 20909744.0,
"step": 4100
},
{
"entropy": 0.04498438341543078,
"epoch": 2.7466666666666666,
"grad_norm": 0.06054578721523285,
"learning_rate": 3.894112273691697e-06,
"loss": 0.041690278053283694,
"mean_token_accuracy": 0.9799363717436791,
"num_tokens": 21011520.0,
"step": 4120
},
{
"entropy": 0.04519799826666713,
"epoch": 2.76,
"grad_norm": 0.06741084903478622,
"learning_rate": 3.4983505527688586e-06,
"loss": 0.042607730627059935,
"mean_token_accuracy": 0.979535199701786,
"num_tokens": 21113638.0,
"step": 4140
},
{
"entropy": 0.04527061656117439,
"epoch": 2.7733333333333334,
"grad_norm": 0.053430285304784775,
"learning_rate": 3.1234344326742657e-06,
"loss": 0.04179522097110748,
"mean_token_accuracy": 0.979697409272194,
"num_tokens": 21215783.0,
"step": 4160
},
{
"entropy": 0.045730549935251476,
"epoch": 2.7866666666666666,
"grad_norm": 0.07262956351041794,
"learning_rate": 2.7694449001250512e-06,
"loss": 0.042841532826423646,
"mean_token_accuracy": 0.9794132426381111,
"num_tokens": 21317798.0,
"step": 4180
},
{
"entropy": 0.04552676072344184,
"epoch": 2.8,
"grad_norm": 0.06751976907253265,
"learning_rate": 2.4364584214254695e-06,
"loss": 0.04251702129840851,
"mean_token_accuracy": 0.9793218955397606,
"num_tokens": 21419787.0,
"step": 4200
},
{
"entropy": 0.045480293966829774,
"epoch": 2.8133333333333335,
"grad_norm": 0.0856935977935791,
"learning_rate": 2.124546925949389e-06,
"loss": 0.04228883981704712,
"mean_token_accuracy": 0.9794924795627594,
"num_tokens": 21521816.0,
"step": 4220
},
{
"entropy": 0.04522721925750375,
"epoch": 2.8266666666666667,
"grad_norm": 0.04721014201641083,
"learning_rate": 1.8337777906023978e-06,
"loss": 0.04205127358436585,
"mean_token_accuracy": 0.9795928984880448,
"num_tokens": 21623696.0,
"step": 4240
},
{
"entropy": 0.0451619129627943,
"epoch": 2.84,
"grad_norm": 0.06828150898218155,
"learning_rate": 1.5642138252677019e-06,
"loss": 0.041848546266555785,
"mean_token_accuracy": 0.9796140640974045,
"num_tokens": 21726066.0,
"step": 4260
},
{
"entropy": 0.04501318633556366,
"epoch": 2.8533333333333335,
"grad_norm": 0.08222071826457977,
"learning_rate": 1.3159132592382772e-06,
"loss": 0.04213366806507111,
"mean_token_accuracy": 0.9795982599258423,
"num_tokens": 21828178.0,
"step": 4280
},
{
"entropy": 0.0461537716910243,
"epoch": 2.8666666666666667,
"grad_norm": 0.0802520290017128,
"learning_rate": 1.0889297286386102e-06,
"loss": 0.04323468208312988,
"mean_token_accuracy": 0.9791506737470627,
"num_tokens": 21929963.0,
"step": 4300
},
{
"entropy": 0.04528212863951921,
"epoch": 2.88,
"grad_norm": 0.08974730968475342,
"learning_rate": 8.833122648386871e-07,
"loss": 0.042816996574401855,
"mean_token_accuracy": 0.9789806365966797,
"num_tokens": 22032092.0,
"step": 4320
},
{
"entropy": 0.045245842542499304,
"epoch": 2.8933333333333335,
"grad_norm": 0.05283057317137718,
"learning_rate": 6.991052838624113e-07,
"loss": 0.04174770712852478,
"mean_token_accuracy": 0.9798634141683579,
"num_tokens": 22134281.0,
"step": 4340
},
{
"entropy": 0.045284852758049964,
"epoch": 2.9066666666666667,
"grad_norm": 0.0722041130065918,
"learning_rate": 5.363485767933663e-07,
"loss": 0.041790124773979184,
"mean_token_accuracy": 0.979168464243412,
"num_tokens": 22236085.0,
"step": 4360
},
{
"entropy": 0.04504124140366912,
"epoch": 2.92,
"grad_norm": 0.06595401465892792,
"learning_rate": 3.9507730117926967e-07,
"loss": 0.04146735072135925,
"mean_token_accuracy": 0.9801181107759476,
"num_tokens": 22338053.0,
"step": 4380
},
{
"entropy": 0.04522117590531707,
"epoch": 2.9333333333333336,
"grad_norm": 0.06364521384239197,
"learning_rate": 2.7532197343758115e-07,
"loss": 0.04191155731678009,
"mean_token_accuracy": 0.9794103637337684,
"num_tokens": 22440208.0,
"step": 4400
},
{
"entropy": 0.045472448039799926,
"epoch": 2.9466666666666668,
"grad_norm": 0.0597660131752491,
"learning_rate": 1.7710846226355328e-07,
"loss": 0.04289998710155487,
"mean_token_accuracy": 0.9792811706662178,
"num_tokens": 22542219.0,
"step": 4420
},
{
"entropy": 0.04583751475438476,
"epoch": 2.96,
"grad_norm": 0.08572968095541,
"learning_rate": 1.0045798304220145e-07,
"loss": 0.0427745521068573,
"mean_token_accuracy": 0.9792221873998642,
"num_tokens": 22644025.0,
"step": 4440
},
{
"entropy": 0.04562570815905929,
"epoch": 2.9733333333333336,
"grad_norm": 0.0797945037484169,
"learning_rate": 4.5387093265591986e-08,
"loss": 0.04286653101444245,
"mean_token_accuracy": 0.9792360305786133,
"num_tokens": 22745968.0,
"step": 4460
},
{
"entropy": 0.045168190728873014,
"epoch": 2.986666666666667,
"grad_norm": 0.07274357974529266,
"learning_rate": 1.1907688956136477e-08,
"loss": 0.04201154708862305,
"mean_token_accuracy": 0.9799786448478699,
"num_tokens": 22848205.0,
"step": 4480
},
{
"entropy": 0.045816550869494675,
"epoch": 3.0,
"grad_norm": 0.06689723581075668,
"learning_rate": 2.70020969361795e-11,
"loss": 0.042978566884994504,
"mean_token_accuracy": 0.9794494539499283,
"num_tokens": 22950555.0,
"step": 4500
}
],
"logging_steps": 20,
"max_steps": 4500,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5251560037074944e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}