{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.082474226804124, "eval_steps": 1, "global_step": 52, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9448668956756592, "epoch": 0.041237113402061855, "grad_norm": 0.07226159423589706, "learning_rate": 0.0, "loss": 0.09753526747226715, "mean_token_accuracy": 0.7355087697505951, "num_tokens": 21635.0, "step": 1 }, { "epoch": 0.041237113402061855, "eval_entropy": 0.9608347535133361, "eval_loss": 0.08507955074310303, "eval_mean_token_accuracy": 0.9790845354398091, "eval_num_tokens": 21635.0, "eval_runtime": 22.1978, "eval_samples_per_second": 0.676, "eval_steps_per_second": 0.676, "step": 1 }, { "entropy": 0.9745071977376938, "epoch": 0.08247422680412371, "grad_norm": 0.06922032684087753, "learning_rate": 3.3333333333333335e-05, "loss": 0.1450115144252777, "mean_token_accuracy": 0.9680577963590622, "num_tokens": 33513.0, "step": 2 }, { "epoch": 0.08247422680412371, "eval_entropy": 0.9645299235979716, "eval_loss": 0.08592262864112854, "eval_mean_token_accuracy": 0.9785626331965128, "eval_num_tokens": 33513.0, "eval_runtime": 21.0428, "eval_samples_per_second": 0.713, "eval_steps_per_second": 0.713, "step": 2 }, { "entropy": 1.034986674785614, "epoch": 0.12371134020618557, "grad_norm": 0.09312178194522858, "learning_rate": 6.666666666666667e-05, "loss": 0.21599137783050537, "mean_token_accuracy": 0.9498571008443832, "num_tokens": 46880.0, "step": 3 }, { "epoch": 0.12371134020618557, "eval_entropy": 0.9633764783541362, "eval_loss": 0.0840531513094902, "eval_mean_token_accuracy": 0.9798958341280619, "eval_num_tokens": 46880.0, "eval_runtime": 21.0469, "eval_samples_per_second": 0.713, "eval_steps_per_second": 0.713, "step": 3 }, { "entropy": 0.9331036657094955, "epoch": 0.16494845360824742, "grad_norm": 0.044576335698366165, "learning_rate": 0.0001, "loss": 0.12804706394672394, "mean_token_accuracy": 0.9727730453014374, "num_tokens": 67820.0, "step": 4 }, { "epoch": 0.16494845360824742, "eval_entropy": 0.9656182010968526, "eval_loss": 0.08270721137523651, "eval_mean_token_accuracy": 0.9796255747477214, "eval_num_tokens": 67820.0, "eval_runtime": 20.7162, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 4 }, { "entropy": 0.9677977859973907, "epoch": 0.20618556701030927, "grad_norm": 0.07005823403596878, "learning_rate": 9.861111111111112e-05, "loss": 0.09690296649932861, "mean_token_accuracy": 0.9714849889278412, "num_tokens": 83538.0, "step": 5 }, { "epoch": 0.20618556701030927, "eval_entropy": 0.9692896882692973, "eval_loss": 0.07981620728969574, "eval_mean_token_accuracy": 0.9793731331825256, "eval_num_tokens": 83538.0, "eval_runtime": 20.7383, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 5 }, { "entropy": 0.9514970034360886, "epoch": 0.24742268041237114, "grad_norm": 0.060433391481637955, "learning_rate": 9.722222222222223e-05, "loss": 0.09173069894313812, "mean_token_accuracy": 0.9764781147241592, "num_tokens": 99438.0, "step": 6 }, { "epoch": 0.24742268041237114, "eval_entropy": 0.9704787929852804, "eval_loss": 0.07730450481176376, "eval_mean_token_accuracy": 0.9804888725280761, "eval_num_tokens": 99438.0, "eval_runtime": 20.7525, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 6 }, { "entropy": 0.9307974129915237, "epoch": 0.28865979381443296, "grad_norm": 0.05741553008556366, "learning_rate": 9.583333333333334e-05, "loss": 0.08163456618785858, "mean_token_accuracy": 0.9720405638217926, "num_tokens": 118641.0, "step": 7 }, { "epoch": 0.28865979381443296, "eval_entropy": 0.9690600077311198, "eval_loss": 0.07438266277313232, "eval_mean_token_accuracy": 0.9809866070747375, "eval_num_tokens": 118641.0, "eval_runtime": 20.7456, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 7 }, { "entropy": 0.9306564778089523, "epoch": 0.32989690721649484, "grad_norm": 0.04258479177951813, "learning_rate": 9.444444444444444e-05, "loss": 0.07382050156593323, "mean_token_accuracy": 0.9821059703826904, "num_tokens": 137462.0, "step": 8 }, { "epoch": 0.32989690721649484, "eval_entropy": 0.9716742753982544, "eval_loss": 0.07355732470750809, "eval_mean_token_accuracy": 0.9804219245910645, "eval_num_tokens": 137462.0, "eval_runtime": 20.697, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 8 }, { "entropy": 0.9266971349716187, "epoch": 0.3711340206185567, "grad_norm": 0.05563495308160782, "learning_rate": 9.305555555555556e-05, "loss": 0.1523049771785736, "mean_token_accuracy": 0.9683622866868973, "num_tokens": 153871.0, "step": 9 }, { "epoch": 0.3711340206185567, "eval_entropy": 0.9706975976626079, "eval_loss": 0.07187264412641525, "eval_mean_token_accuracy": 0.9810372312863668, "eval_num_tokens": 153871.0, "eval_runtime": 20.9986, "eval_samples_per_second": 0.714, "eval_steps_per_second": 0.714, "step": 9 }, { "entropy": 0.961930587887764, "epoch": 0.41237113402061853, "grad_norm": 0.05789535492658615, "learning_rate": 9.166666666666667e-05, "loss": 0.1482037901878357, "mean_token_accuracy": 0.9669816046953201, "num_tokens": 170912.0, "step": 10 }, { "epoch": 0.41237113402061853, "eval_entropy": 0.9823563456535339, "eval_loss": 0.07145611941814423, "eval_mean_token_accuracy": 0.9822565118471781, "eval_num_tokens": 170912.0, "eval_runtime": 21.0009, "eval_samples_per_second": 0.714, "eval_steps_per_second": 0.714, "step": 10 }, { "entropy": 0.960724487900734, "epoch": 0.4536082474226804, "grad_norm": 0.04560323432087898, "learning_rate": 9.027777777777779e-05, "loss": 0.07032930850982666, "mean_token_accuracy": 0.9809373319149017, "num_tokens": 187426.0, "step": 11 }, { "epoch": 0.4536082474226804, "eval_entropy": 0.9839813590049744, "eval_loss": 0.07051929086446762, "eval_mean_token_accuracy": 0.9812469045321147, "eval_num_tokens": 187426.0, "eval_runtime": 20.9549, "eval_samples_per_second": 0.716, "eval_steps_per_second": 0.716, "step": 11 }, { "entropy": 0.9719891250133514, "epoch": 0.4948453608247423, "grad_norm": 0.1134040355682373, "learning_rate": 8.888888888888889e-05, "loss": 0.13303802907466888, "mean_token_accuracy": 0.719734787940979, "num_tokens": 206626.0, "step": 12 }, { "epoch": 0.4948453608247423, "eval_entropy": 0.9883692304293314, "eval_loss": 0.07048198580741882, "eval_mean_token_accuracy": 0.9817201534907023, "eval_num_tokens": 206626.0, "eval_runtime": 20.9201, "eval_samples_per_second": 0.717, "eval_steps_per_second": 0.717, "step": 12 }, { "entropy": 0.9694001376628876, "epoch": 0.5360824742268041, "grad_norm": 0.04408908262848854, "learning_rate": 8.75e-05, "loss": 0.07910753041505814, "mean_token_accuracy": 0.9814473092556, "num_tokens": 228836.0, "step": 13 }, { "epoch": 0.5360824742268041, "eval_entropy": 0.9879396398862202, "eval_loss": 0.06983672827482224, "eval_mean_token_accuracy": 0.9817187468210856, "eval_num_tokens": 228836.0, "eval_runtime": 21.0036, "eval_samples_per_second": 0.714, "eval_steps_per_second": 0.714, "step": 13 }, { "entropy": 0.998613715171814, "epoch": 0.5773195876288659, "grad_norm": 0.12266044318675995, "learning_rate": 8.611111111111112e-05, "loss": 0.07777023315429688, "mean_token_accuracy": 0.9771675318479538, "num_tokens": 245137.0, "step": 14 }, { "epoch": 0.5773195876288659, "eval_entropy": 0.9935583511988322, "eval_loss": 0.06945585459470749, "eval_mean_token_accuracy": 0.981730604171753, "eval_num_tokens": 245137.0, "eval_runtime": 20.9802, "eval_samples_per_second": 0.715, "eval_steps_per_second": 0.715, "step": 14 }, { "entropy": 1.0273631364107132, "epoch": 0.6185567010309279, "grad_norm": 0.04747646674513817, "learning_rate": 8.472222222222222e-05, "loss": 0.047956813126802444, "mean_token_accuracy": 0.9845701456069946, "num_tokens": 257942.0, "step": 15 }, { "epoch": 0.6185567010309279, "eval_entropy": 0.9991331934928894, "eval_loss": 0.0690494030714035, "eval_mean_token_accuracy": 0.9824923793474833, "eval_num_tokens": 257942.0, "eval_runtime": 20.7211, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 15 }, { "entropy": 1.0196497589349747, "epoch": 0.6597938144329897, "grad_norm": 0.054021257907152176, "learning_rate": 8.333333333333334e-05, "loss": 0.11089849472045898, "mean_token_accuracy": 0.973548635840416, "num_tokens": 269457.0, "step": 16 }, { "epoch": 0.6597938144329897, "eval_entropy": 1.0019009590148926, "eval_loss": 0.06880267709493637, "eval_mean_token_accuracy": 0.9824360330899556, "eval_num_tokens": 269457.0, "eval_runtime": 20.739, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 16 }, { "entropy": 1.0412271618843079, "epoch": 0.7010309278350515, "grad_norm": 0.053728897124528885, "learning_rate": 8.194444444444445e-05, "loss": 0.11761482059955597, "mean_token_accuracy": 0.9692688584327698, "num_tokens": 280427.0, "step": 17 }, { "epoch": 0.7010309278350515, "eval_entropy": 1.005230430761973, "eval_loss": 0.06931651383638382, "eval_mean_token_accuracy": 0.9825660387674967, "eval_num_tokens": 280427.0, "eval_runtime": 20.6944, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 17 }, { "entropy": 0.9664316028356552, "epoch": 0.7422680412371134, "grad_norm": 0.03628600761294365, "learning_rate": 8.055555555555556e-05, "loss": 0.06299219280481339, "mean_token_accuracy": 0.9820192903280258, "num_tokens": 299201.0, "step": 18 }, { "epoch": 0.7422680412371134, "eval_entropy": 1.007782240708669, "eval_loss": 0.0696113333106041, "eval_mean_token_accuracy": 0.9823788642883301, "eval_num_tokens": 299201.0, "eval_runtime": 20.6991, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 18 }, { "entropy": 1.0266517251729965, "epoch": 0.7835051546391752, "grad_norm": 0.049643345177173615, "learning_rate": 7.916666666666666e-05, "loss": 0.07530396431684494, "mean_token_accuracy": 0.9791738986968994, "num_tokens": 311861.0, "step": 19 }, { "epoch": 0.7835051546391752, "eval_entropy": 1.0088731487592062, "eval_loss": 0.06953989714384079, "eval_mean_token_accuracy": 0.9839362303415934, "eval_num_tokens": 311861.0, "eval_runtime": 20.7517, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 19 }, { "entropy": 1.0745487809181213, "epoch": 0.8247422680412371, "grad_norm": 0.06066666916012764, "learning_rate": 7.777777777777778e-05, "loss": 0.1054162010550499, "mean_token_accuracy": 0.9649138450622559, "num_tokens": 325160.0, "step": 20 }, { "epoch": 0.8247422680412371, "eval_entropy": 1.0058039903640748, "eval_loss": 0.06919559836387634, "eval_mean_token_accuracy": 0.9831933895746867, "eval_num_tokens": 325160.0, "eval_runtime": 20.719, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 20 }, { "entropy": 1.0028491765260696, "epoch": 0.865979381443299, "grad_norm": 0.03964878246188164, "learning_rate": 7.638888888888889e-05, "loss": 0.09752834588289261, "mean_token_accuracy": 0.9719387143850327, "num_tokens": 339272.0, "step": 21 }, { "epoch": 0.865979381443299, "eval_entropy": 1.0079486091931662, "eval_loss": 0.06905102729797363, "eval_mean_token_accuracy": 0.983608094851176, "eval_num_tokens": 339272.0, "eval_runtime": 20.7, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 21 }, { "entropy": 1.037087768316269, "epoch": 0.9072164948453608, "grad_norm": 0.07998791337013245, "learning_rate": 7.500000000000001e-05, "loss": 0.24303758144378662, "mean_token_accuracy": 0.9511117488145828, "num_tokens": 358300.0, "step": 22 }, { "epoch": 0.9072164948453608, "eval_entropy": 1.0054601073265075, "eval_loss": 0.06891996413469315, "eval_mean_token_accuracy": 0.9832461039225261, "eval_num_tokens": 358300.0, "eval_runtime": 20.7064, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 22 }, { "entropy": 1.0226812064647675, "epoch": 0.9484536082474226, "grad_norm": 0.06699639558792114, "learning_rate": 7.361111111111111e-05, "loss": 0.09675773978233337, "mean_token_accuracy": 0.9756103903055191, "num_tokens": 369770.0, "step": 23 }, { "epoch": 0.9484536082474226, "eval_entropy": 1.002508318424225, "eval_loss": 0.06882204115390778, "eval_mean_token_accuracy": 0.983385169506073, "eval_num_tokens": 369770.0, "eval_runtime": 20.7395, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 23 }, { "entropy": 1.0239011645317078, "epoch": 0.9896907216494846, "grad_norm": 0.026397380977869034, "learning_rate": 7.222222222222222e-05, "loss": 0.03568102791905403, "mean_token_accuracy": 0.9908256679773331, "num_tokens": 383233.0, "step": 24 }, { "epoch": 0.9896907216494846, "eval_entropy": 0.9966842492421468, "eval_loss": 0.06899628043174744, "eval_mean_token_accuracy": 0.9829049468040466, "eval_num_tokens": 383233.0, "eval_runtime": 20.6818, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 24 }, { "entropy": 0.9550238251686096, "epoch": 1.0, "grad_norm": 0.0737379938364029, "learning_rate": 7.083333333333334e-05, "loss": 0.14045721292495728, "mean_token_accuracy": 0.9684210419654846, "num_tokens": 391425.0, "step": 25 }, { "epoch": 1.0, "eval_entropy": 0.9951914588610331, "eval_loss": 0.0682818591594696, "eval_mean_token_accuracy": 0.9826431393623352, "eval_num_tokens": 391425.0, "eval_runtime": 20.701, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 25 }, { "entropy": 1.0287320613861084, "epoch": 1.041237113402062, "grad_norm": 0.047680310904979706, "learning_rate": 6.944444444444444e-05, "loss": 0.12030667811632156, "mean_token_accuracy": 0.9669114947319031, "num_tokens": 408084.0, "step": 26 }, { "epoch": 1.041237113402062, "eval_entropy": 0.9973281343777974, "eval_loss": 0.06792157143354416, "eval_mean_token_accuracy": 0.9827638228734334, "eval_num_tokens": 408084.0, "eval_runtime": 20.7357, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 26 }, { "entropy": 0.951363131403923, "epoch": 1.0824742268041236, "grad_norm": 0.032316166907548904, "learning_rate": 6.805555555555556e-05, "loss": 0.06680411100387573, "mean_token_accuracy": 0.9805278331041336, "num_tokens": 426980.0, "step": 27 }, { "epoch": 1.0824742268041236, "eval_entropy": 0.9922900557518005, "eval_loss": 0.06796544045209885, "eval_mean_token_accuracy": 0.983365794022878, "eval_num_tokens": 426980.0, "eval_runtime": 20.7285, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 27 }, { "entropy": 0.9819868952035904, "epoch": 1.1237113402061856, "grad_norm": 0.04183343052864075, "learning_rate": 6.666666666666667e-05, "loss": 0.05359924957156181, "mean_token_accuracy": 0.9839397221803665, "num_tokens": 444015.0, "step": 28 }, { "epoch": 1.1237113402061856, "eval_entropy": 0.9886756420135498, "eval_loss": 0.06777461618185043, "eval_mean_token_accuracy": 0.9830594817797343, "eval_num_tokens": 444015.0, "eval_runtime": 20.7148, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 28 }, { "entropy": 1.0686464607715607, "epoch": 1.1649484536082475, "grad_norm": 0.06484964489936829, "learning_rate": 6.527777777777778e-05, "loss": 0.09985277056694031, "mean_token_accuracy": 0.9800463020801544, "num_tokens": 457761.0, "step": 29 }, { "epoch": 1.1649484536082475, "eval_entropy": 0.9843578020731608, "eval_loss": 0.0674084722995758, "eval_mean_token_accuracy": 0.9833337426185608, "eval_num_tokens": 457761.0, "eval_runtime": 20.7037, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 29 }, { "entropy": 0.9393526464700699, "epoch": 1.2061855670103092, "grad_norm": 0.034308142960071564, "learning_rate": 6.388888888888888e-05, "loss": 0.06847456097602844, "mean_token_accuracy": 0.9796052426099777, "num_tokens": 472663.0, "step": 30 }, { "epoch": 1.2061855670103092, "eval_entropy": 0.9842836221059164, "eval_loss": 0.06759599596261978, "eval_mean_token_accuracy": 0.9836065967877706, "eval_num_tokens": 472663.0, "eval_runtime": 20.707, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 30 }, { "entropy": 1.0024953335523605, "epoch": 1.2474226804123711, "grad_norm": 0.0696142315864563, "learning_rate": 6.25e-05, "loss": 0.11801434308290482, "mean_token_accuracy": 0.9741000682115555, "num_tokens": 490077.0, "step": 31 }, { "epoch": 1.2474226804123711, "eval_entropy": 0.982773232460022, "eval_loss": 0.06787579506635666, "eval_mean_token_accuracy": 0.9834913015365601, "eval_num_tokens": 490077.0, "eval_runtime": 20.7322, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 31 }, { "entropy": 1.0572933554649353, "epoch": 1.2886597938144329, "grad_norm": 0.04281700775027275, "learning_rate": 6.111111111111112e-05, "loss": 0.09108109772205353, "mean_token_accuracy": 0.978986382484436, "num_tokens": 501496.0, "step": 32 }, { "epoch": 1.2886597938144329, "eval_entropy": 0.9782611966133118, "eval_loss": 0.06708737462759018, "eval_mean_token_accuracy": 0.9841011524200439, "eval_num_tokens": 501496.0, "eval_runtime": 20.7048, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 32 }, { "entropy": 1.0040322095155716, "epoch": 1.3298969072164948, "grad_norm": 0.069253109395504, "learning_rate": 5.972222222222223e-05, "loss": 0.2012481987476349, "mean_token_accuracy": 0.9528108686208725, "num_tokens": 520538.0, "step": 33 }, { "epoch": 1.3298969072164948, "eval_entropy": 0.976926863193512, "eval_loss": 0.06753435730934143, "eval_mean_token_accuracy": 0.9836338480313619, "eval_num_tokens": 520538.0, "eval_runtime": 20.7147, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 33 }, { "entropy": 0.9550057649612427, "epoch": 1.3711340206185567, "grad_norm": 0.035315074026584625, "learning_rate": 5.833333333333334e-05, "loss": 0.02489563450217247, "mean_token_accuracy": 0.7439251840114594, "num_tokens": 539673.0, "step": 34 }, { "epoch": 1.3711340206185567, "eval_entropy": 0.9790456970532735, "eval_loss": 0.0665697529911995, "eval_mean_token_accuracy": 0.9837995886802673, "eval_num_tokens": 539673.0, "eval_runtime": 20.706, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 34 }, { "entropy": 0.9477995336055756, "epoch": 1.4123711340206184, "grad_norm": 0.031360119581222534, "learning_rate": 5.6944444444444445e-05, "loss": 0.05922059714794159, "mean_token_accuracy": 0.9858599007129669, "num_tokens": 555451.0, "step": 35 }, { "epoch": 1.4123711340206184, "eval_entropy": 0.9793919205665589, "eval_loss": 0.06644915044307709, "eval_mean_token_accuracy": 0.9836917479832967, "eval_num_tokens": 555451.0, "eval_runtime": 20.7392, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 35 }, { "entropy": 1.0259694159030914, "epoch": 1.4536082474226804, "grad_norm": 0.029859432950615883, "learning_rate": 5.555555555555556e-05, "loss": 0.059510719031095505, "mean_token_accuracy": 0.986285462975502, "num_tokens": 576966.0, "step": 36 }, { "epoch": 1.4536082474226804, "eval_entropy": 0.9714203874270121, "eval_loss": 0.06650324165821075, "eval_mean_token_accuracy": 0.9838468551635742, "eval_num_tokens": 576966.0, "eval_runtime": 20.7245, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 36 }, { "entropy": 0.9139361828565598, "epoch": 1.4948453608247423, "grad_norm": 0.02590501494705677, "learning_rate": 5.4166666666666664e-05, "loss": 0.0428544282913208, "mean_token_accuracy": 0.9883403331041336, "num_tokens": 590269.0, "step": 37 }, { "epoch": 1.4948453608247423, "eval_entropy": 0.9684606234232584, "eval_loss": 0.06649193167686462, "eval_mean_token_accuracy": 0.984057362874349, "eval_num_tokens": 590269.0, "eval_runtime": 20.7587, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 37 }, { "entropy": 1.0228834748268127, "epoch": 1.536082474226804, "grad_norm": 0.04240834340453148, "learning_rate": 5.2777777777777784e-05, "loss": 0.07022501528263092, "mean_token_accuracy": 0.9773621559143066, "num_tokens": 602225.0, "step": 38 }, { "epoch": 1.536082474226804, "eval_entropy": 0.9702978054682414, "eval_loss": 0.06602076441049576, "eval_mean_token_accuracy": 0.9841559131940206, "eval_num_tokens": 602225.0, "eval_runtime": 20.7433, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 38 }, { "entropy": 0.9485574662685394, "epoch": 1.577319587628866, "grad_norm": 0.04254386946558952, "learning_rate": 5.138888888888889e-05, "loss": 0.06380658596754074, "mean_token_accuracy": 0.9830654263496399, "num_tokens": 617294.0, "step": 39 }, { "epoch": 1.577319587628866, "eval_entropy": 0.966978398958842, "eval_loss": 0.06652393192052841, "eval_mean_token_accuracy": 0.9835621436436971, "eval_num_tokens": 617294.0, "eval_runtime": 20.7635, "eval_samples_per_second": 0.722, "eval_steps_per_second": 0.722, "step": 39 }, { "entropy": 0.9906637221574783, "epoch": 1.6185567010309279, "grad_norm": 0.040576934814453125, "learning_rate": 5e-05, "loss": 0.07286536693572998, "mean_token_accuracy": 0.9796862304210663, "num_tokens": 633138.0, "step": 40 }, { "epoch": 1.6185567010309279, "eval_entropy": 0.96572847366333, "eval_loss": 0.06656248867511749, "eval_mean_token_accuracy": 0.9839033047358196, "eval_num_tokens": 633138.0, "eval_runtime": 20.7125, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 40 }, { "entropy": 1.0376707464456558, "epoch": 1.6597938144329896, "grad_norm": 0.03729933872818947, "learning_rate": 4.8611111111111115e-05, "loss": 0.06184152886271477, "mean_token_accuracy": 0.9809065908193588, "num_tokens": 646631.0, "step": 41 }, { "epoch": 1.6597938144329896, "eval_entropy": 0.965199089050293, "eval_loss": 0.06638391315937042, "eval_mean_token_accuracy": 0.9839538494745891, "eval_num_tokens": 646631.0, "eval_runtime": 20.7908, "eval_samples_per_second": 0.721, "eval_steps_per_second": 0.721, "step": 41 }, { "entropy": 0.8635133057832718, "epoch": 1.7010309278350515, "grad_norm": 0.03243474289774895, "learning_rate": 4.722222222222222e-05, "loss": 0.05577832832932472, "mean_token_accuracy": 0.9830382317304611, "num_tokens": 666468.0, "step": 42 }, { "epoch": 1.7010309278350515, "eval_entropy": 0.9654537439346313, "eval_loss": 0.06642203032970428, "eval_mean_token_accuracy": 0.9833759824434917, "eval_num_tokens": 666468.0, "eval_runtime": 20.748, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 42 }, { "entropy": 0.9067096710205078, "epoch": 1.7422680412371134, "grad_norm": 0.05539465695619583, "learning_rate": 4.5833333333333334e-05, "loss": 0.08047321438789368, "mean_token_accuracy": 0.9781613945960999, "num_tokens": 682772.0, "step": 43 }, { "epoch": 1.7422680412371134, "eval_entropy": 0.9598313728968303, "eval_loss": 0.06654039770364761, "eval_mean_token_accuracy": 0.9835031549135844, "eval_num_tokens": 682772.0, "eval_runtime": 20.7432, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 43 }, { "entropy": 0.9436471909284592, "epoch": 1.7835051546391751, "grad_norm": 0.059121616184711456, "learning_rate": 4.4444444444444447e-05, "loss": 0.042895037680864334, "mean_token_accuracy": 0.9919675588607788, "num_tokens": 696769.0, "step": 44 }, { "epoch": 1.7835051546391751, "eval_entropy": 0.9644124428431193, "eval_loss": 0.06594374030828476, "eval_mean_token_accuracy": 0.9840018550554911, "eval_num_tokens": 696769.0, "eval_runtime": 20.7587, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 44 }, { "entropy": 1.0255336910486221, "epoch": 1.824742268041237, "grad_norm": 0.05181759223341942, "learning_rate": 4.305555555555556e-05, "loss": 0.05620555207133293, "mean_token_accuracy": 0.9859640300273895, "num_tokens": 710603.0, "step": 45 }, { "epoch": 1.824742268041237, "eval_entropy": 0.9609371066093445, "eval_loss": 0.0661281868815422, "eval_mean_token_accuracy": 0.9831904411315918, "eval_num_tokens": 710603.0, "eval_runtime": 20.7077, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 45 }, { "entropy": 1.0243400931358337, "epoch": 1.865979381443299, "grad_norm": 0.05008361488580704, "learning_rate": 4.166666666666667e-05, "loss": 0.10409370064735413, "mean_token_accuracy": 0.9660573154687881, "num_tokens": 723842.0, "step": 46 }, { "epoch": 1.865979381443299, "eval_entropy": 0.9625437498092652, "eval_loss": 0.06587561219930649, "eval_mean_token_accuracy": 0.983132266998291, "eval_num_tokens": 723842.0, "eval_runtime": 20.7346, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.723, "step": 46 }, { "entropy": 0.940504863858223, "epoch": 1.9072164948453607, "grad_norm": 0.039519134908914566, "learning_rate": 4.027777777777778e-05, "loss": 0.0706515908241272, "mean_token_accuracy": 0.9757581502199173, "num_tokens": 739722.0, "step": 47 }, { "epoch": 1.9072164948453607, "eval_entropy": 0.9595511476198832, "eval_loss": 0.0659380629658699, "eval_mean_token_accuracy": 0.9841223756472269, "eval_num_tokens": 739722.0, "eval_runtime": 20.71, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 47 }, { "entropy": 0.9288730025291443, "epoch": 1.9484536082474226, "grad_norm": 0.04865194484591484, "learning_rate": 3.888888888888889e-05, "loss": 0.07841087132692337, "mean_token_accuracy": 0.7290582060813904, "num_tokens": 761217.0, "step": 48 }, { "epoch": 1.9484536082474226, "eval_entropy": 0.9586830457051595, "eval_loss": 0.06554747372865677, "eval_mean_token_accuracy": 0.9832711418469747, "eval_num_tokens": 761217.0, "eval_runtime": 20.726, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 48 }, { "entropy": 0.9199479967355728, "epoch": 1.9896907216494846, "grad_norm": 0.029802290722727776, "learning_rate": 3.7500000000000003e-05, "loss": 0.049351807683706284, "mean_token_accuracy": 0.9868962615728378, "num_tokens": 777316.0, "step": 49 }, { "epoch": 1.9896907216494846, "eval_entropy": 0.9603285034497578, "eval_loss": 0.06658131629228592, "eval_mean_token_accuracy": 0.9836367726325989, "eval_num_tokens": 777316.0, "eval_runtime": 20.7752, "eval_samples_per_second": 0.722, "eval_steps_per_second": 0.722, "step": 49 }, { "entropy": 0.8757496476173401, "epoch": 2.0, "grad_norm": 0.11461242288351059, "learning_rate": 3.611111111111111e-05, "loss": 0.0841030701994896, "mean_token_accuracy": 0.9775280952453613, "num_tokens": 782850.0, "step": 50 }, { "epoch": 2.0, "eval_entropy": 0.9585678974787394, "eval_loss": 0.06587841361761093, "eval_mean_token_accuracy": 0.9834277669588725, "eval_num_tokens": 782850.0, "eval_runtime": 20.6962, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.725, "step": 50 }, { "entropy": 0.9463829398155212, "epoch": 2.0412371134020617, "grad_norm": 0.04902574419975281, "learning_rate": 3.472222222222222e-05, "loss": 0.15842466056346893, "mean_token_accuracy": 0.9630147218704224, "num_tokens": 803046.0, "step": 51 }, { "epoch": 2.0412371134020617, "eval_entropy": 0.9597042759259542, "eval_loss": 0.06541614979505539, "eval_mean_token_accuracy": 0.9843029538790385, "eval_num_tokens": 803046.0, "eval_runtime": 20.7117, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.724, "step": 51 }, { "entropy": 0.9610249996185303, "epoch": 2.082474226804124, "grad_norm": 0.030446451157331467, "learning_rate": 3.3333333333333335e-05, "loss": 0.05591355264186859, "mean_token_accuracy": 0.9813788533210754, "num_tokens": 819050.0, "step": 52 }, { "epoch": 2.082474226804124, "eval_entropy": 0.9546857873598734, "eval_loss": 0.06584987044334412, "eval_mean_token_accuracy": 0.9835208853085836, "eval_num_tokens": 819050.0, "eval_runtime": 20.8142, "eval_samples_per_second": 0.721, "eval_steps_per_second": 0.721, "step": 52 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4030884021180826e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }