{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9860565441799554, "eval_steps": 50, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012325706802249441, "grad_norm": 7.452893819977444, "learning_rate": 2.1951219512195125e-06, "loss": 2.0453, "step": 10 }, { "epoch": 0.024651413604498882, "grad_norm": 2.3543037492581287, "learning_rate": 4.634146341463416e-06, "loss": 1.3516, "step": 20 }, { "epoch": 0.036977120406748325, "grad_norm": 2.2262236440888494, "learning_rate": 7.0731707317073175e-06, "loss": 1.1591, "step": 30 }, { "epoch": 0.049302827208997764, "grad_norm": 2.0267175449632946, "learning_rate": 9.51219512195122e-06, "loss": 1.1018, "step": 40 }, { "epoch": 0.06162853401124721, "grad_norm": 2.298382109468873, "learning_rate": 9.997343728181266e-06, "loss": 1.0783, "step": 50 }, { "epoch": 0.06162853401124721, "eval_loss": 1.534080982208252, "eval_runtime": 70.787, "eval_samples_per_second": 7.063, "eval_steps_per_second": 0.89, "step": 50 }, { "epoch": 0.07395424081349665, "grad_norm": 3.230801767778123, "learning_rate": 9.986557460803527e-06, "loss": 1.0559, "step": 60 }, { "epoch": 0.08627994761574609, "grad_norm": 1.8856823149725142, "learning_rate": 9.967493073239729e-06, "loss": 1.0286, "step": 70 }, { "epoch": 0.09860565441799553, "grad_norm": 1.696852635130396, "learning_rate": 9.940182214035537e-06, "loss": 1.0162, "step": 80 }, { "epoch": 0.11093136122024497, "grad_norm": 1.7949876586725633, "learning_rate": 9.90467022159859e-06, "loss": 1.0129, "step": 90 }, { "epoch": 0.12325706802249442, "grad_norm": 1.7820426803273184, "learning_rate": 9.86101604893279e-06, "loss": 1.0022, "step": 100 }, { "epoch": 0.12325706802249442, "eval_loss": 1.546925663948059, "eval_runtime": 70.866, "eval_samples_per_second": 7.056, "eval_steps_per_second": 0.889, "step": 100 }, { "epoch": 0.13558277482474385, "grad_norm": 1.7974537943774538, "learning_rate": 9.809292165771172e-06, "loss": 0.9944, "step": 110 }, { "epoch": 0.1479084816269933, "grad_norm": 1.6482964944288594, "learning_rate": 9.749584438269833e-06, "loss": 0.9855, "step": 120 }, { "epoch": 0.16023418842924275, "grad_norm": 1.6133070040595134, "learning_rate": 9.681991986462616e-06, "loss": 0.9781, "step": 130 }, { "epoch": 0.17255989523149218, "grad_norm": 1.593902996033425, "learning_rate": 9.60662701971322e-06, "loss": 0.9649, "step": 140 }, { "epoch": 0.18488560203374163, "grad_norm": 1.5942718111544707, "learning_rate": 9.523614650437876e-06, "loss": 0.9599, "step": 150 }, { "epoch": 0.18488560203374163, "eval_loss": 2.1210761070251465, "eval_runtime": 69.8713, "eval_samples_per_second": 7.156, "eval_steps_per_second": 0.902, "step": 150 }, { "epoch": 0.19721130883599106, "grad_norm": 1.9243309107162623, "learning_rate": 9.43309268640781e-06, "loss": 0.9506, "step": 160 }, { "epoch": 0.2095370156382405, "grad_norm": 1.6597331657353593, "learning_rate": 9.335211401976348e-06, "loss": 0.9407, "step": 170 }, { "epoch": 0.22186272244048993, "grad_norm": 1.5608150853931217, "learning_rate": 9.230133288610366e-06, "loss": 0.9411, "step": 180 }, { "epoch": 0.2341884292427394, "grad_norm": 1.659963458100064, "learning_rate": 9.118032785140305e-06, "loss": 0.9241, "step": 190 }, { "epoch": 0.24651413604498884, "grad_norm": 1.564034296830137, "learning_rate": 8.999095988176485e-06, "loss": 0.9223, "step": 200 }, { "epoch": 0.24651413604498884, "eval_loss": 1.790003776550293, "eval_runtime": 69.9162, "eval_samples_per_second": 7.151, "eval_steps_per_second": 0.901, "step": 200 }, { "epoch": 0.25883984284723827, "grad_norm": 1.6906053171178657, "learning_rate": 8.87352034317252e-06, "loss": 0.923, "step": 210 }, { "epoch": 0.2711655496494877, "grad_norm": 1.6922014468370863, "learning_rate": 8.741514316648643e-06, "loss": 0.91, "step": 220 }, { "epoch": 0.2834912564517372, "grad_norm": 1.6951580245360987, "learning_rate": 8.6032970501191e-06, "loss": 0.9145, "step": 230 }, { "epoch": 0.2958169632539866, "grad_norm": 1.5524164632446658, "learning_rate": 8.459097996298137e-06, "loss": 0.8962, "step": 240 }, { "epoch": 0.308142670056236, "grad_norm": 1.6333763395450898, "learning_rate": 8.309156538188483e-06, "loss": 0.9043, "step": 250 }, { "epoch": 0.308142670056236, "eval_loss": 1.9895975589752197, "eval_runtime": 69.9633, "eval_samples_per_second": 7.147, "eval_steps_per_second": 0.9, "step": 250 }, { "epoch": 0.3204683768584855, "grad_norm": 1.741781393236487, "learning_rate": 8.153721591684691e-06, "loss": 0.9016, "step": 260 }, { "epoch": 0.33279408366073493, "grad_norm": 3.492669448448843, "learning_rate": 7.993051192351056e-06, "loss": 0.8874, "step": 270 }, { "epoch": 0.34511979046298435, "grad_norm": 2.158019407592712, "learning_rate": 7.82741206706007e-06, "loss": 0.8833, "step": 280 }, { "epoch": 0.3574454972652338, "grad_norm": 1.8883690282433314, "learning_rate": 7.65707919120256e-06, "loss": 0.8799, "step": 290 }, { "epoch": 0.36977120406748326, "grad_norm": 1.7331682143158789, "learning_rate": 7.482335332204568e-06, "loss": 0.8723, "step": 300 }, { "epoch": 0.36977120406748326, "eval_loss": 2.037771463394165, "eval_runtime": 69.6082, "eval_samples_per_second": 7.183, "eval_steps_per_second": 0.905, "step": 300 }, { "epoch": 0.3820969108697327, "grad_norm": 7.321572264696613, "learning_rate": 7.303470580108756e-06, "loss": 0.8643, "step": 310 }, { "epoch": 0.3944226176719821, "grad_norm": 1.8359969887536733, "learning_rate": 7.120781865999655e-06, "loss": 0.8703, "step": 320 }, { "epoch": 0.4067483244742316, "grad_norm": 1.5137277753009553, "learning_rate": 6.934572469072163e-06, "loss": 0.8619, "step": 330 }, { "epoch": 0.419074031276481, "grad_norm": 1.5838832729805055, "learning_rate": 6.745151513161644e-06, "loss": 0.854, "step": 340 }, { "epoch": 0.43139973807873044, "grad_norm": 1.5482465248384414, "learning_rate": 6.552833453571402e-06, "loss": 0.8597, "step": 350 }, { "epoch": 0.43139973807873044, "eval_loss": 2.191089391708374, "eval_runtime": 69.1769, "eval_samples_per_second": 7.228, "eval_steps_per_second": 0.911, "step": 350 }, { "epoch": 0.44372544488097987, "grad_norm": 1.5361828735502951, "learning_rate": 6.357937555049465e-06, "loss": 0.8472, "step": 360 }, { "epoch": 0.45605115168322935, "grad_norm": 1.5888508157327772, "learning_rate": 6.1607873617812555e-06, "loss": 0.8383, "step": 370 }, { "epoch": 0.4683768584854788, "grad_norm": 1.855784871619504, "learning_rate": 5.961710160278042e-06, "loss": 0.8408, "step": 380 }, { "epoch": 0.4807025652877282, "grad_norm": 1.581873169717113, "learning_rate": 5.761036436052788e-06, "loss": 0.8387, "step": 390 }, { "epoch": 0.4930282720899777, "grad_norm": 1.4854066280222107, "learning_rate": 5.559099324985381e-06, "loss": 0.8313, "step": 400 }, { "epoch": 0.4930282720899777, "eval_loss": 2.278899669647217, "eval_runtime": 68.8551, "eval_samples_per_second": 7.262, "eval_steps_per_second": 0.915, "step": 400 }, { "epoch": 0.5053539788922271, "grad_norm": 1.6017900575388742, "learning_rate": 5.356234060288029e-06, "loss": 0.818, "step": 410 }, { "epoch": 0.5176796856944765, "grad_norm": 2.7089731695917565, "learning_rate": 5.152777415988894e-06, "loss": 0.8237, "step": 420 }, { "epoch": 0.530005392496726, "grad_norm": 1.5583211088607285, "learning_rate": 4.949067147857846e-06, "loss": 0.8125, "step": 430 }, { "epoch": 0.5423310992989754, "grad_norm": 1.6393244511711984, "learning_rate": 4.745441432702425e-06, "loss": 0.8067, "step": 440 }, { "epoch": 0.5546568061012249, "grad_norm": 1.6029455569913715, "learning_rate": 4.542238306964863e-06, "loss": 0.8045, "step": 450 }, { "epoch": 0.5546568061012249, "eval_loss": 2.5131213665008545, "eval_runtime": 68.9289, "eval_samples_per_second": 7.254, "eval_steps_per_second": 0.914, "step": 450 }, { "epoch": 0.5669825129034743, "grad_norm": 1.7346272402790566, "learning_rate": 4.33979510555211e-06, "loss": 0.8022, "step": 460 }, { "epoch": 0.5793082197057238, "grad_norm": 1.5783810238803107, "learning_rate": 4.1384479018304735e-06, "loss": 0.7937, "step": 470 }, { "epoch": 0.5916339265079732, "grad_norm": 1.7378549278683895, "learning_rate": 3.938530949714533e-06, "loss": 0.7949, "step": 480 }, { "epoch": 0.6039596333102226, "grad_norm": 1.500102158637519, "learning_rate": 3.740376128776486e-06, "loss": 0.7924, "step": 490 }, { "epoch": 0.616285340112472, "grad_norm": 1.4830818128420995, "learning_rate": 3.544312393297133e-06, "loss": 0.7787, "step": 500 }, { "epoch": 0.616285340112472, "eval_loss": 2.649029493331909, "eval_runtime": 68.7281, "eval_samples_per_second": 7.275, "eval_steps_per_second": 0.917, "step": 500 }, { "epoch": 0.6286110469147215, "grad_norm": 1.7607503841673393, "learning_rate": 3.350665226173078e-06, "loss": 0.7742, "step": 510 }, { "epoch": 0.640936753716971, "grad_norm": 1.6547014256761263, "learning_rate": 3.1597560985867393e-06, "loss": 0.7791, "step": 520 }, { "epoch": 0.6532624605192204, "grad_norm": 1.7457233774898238, "learning_rate": 2.971901936336172e-06, "loss": 0.776, "step": 530 }, { "epoch": 0.6655881673214699, "grad_norm": 1.6008703103342163, "learning_rate": 2.787414593710583e-06, "loss": 0.772, "step": 540 }, { "epoch": 0.6779138741237193, "grad_norm": 1.5212358385306883, "learning_rate": 2.6066003357850426e-06, "loss": 0.7605, "step": 550 }, { "epoch": 0.6779138741237193, "eval_loss": 2.666884660720825, "eval_runtime": 68.9476, "eval_samples_per_second": 7.252, "eval_steps_per_second": 0.914, "step": 550 }, { "epoch": 0.6902395809259687, "grad_norm": 1.7501370825549742, "learning_rate": 2.4297593299937456e-06, "loss": 0.7713, "step": 560 }, { "epoch": 0.7025652877282181, "grad_norm": 1.7933479321839014, "learning_rate": 2.2571851478258903e-06, "loss": 0.7557, "step": 570 }, { "epoch": 0.7148909945304676, "grad_norm": 1.5453439829194566, "learning_rate": 2.089164277471425e-06, "loss": 0.7589, "step": 580 }, { "epoch": 0.7272167013327171, "grad_norm": 1.4847139112302161, "learning_rate": 1.9259756482256526e-06, "loss": 0.7483, "step": 590 }, { "epoch": 0.7395424081349665, "grad_norm": 1.5573829581447882, "learning_rate": 1.7678901674423044e-06, "loss": 0.7515, "step": 600 }, { "epoch": 0.7395424081349665, "eval_loss": 2.6441097259521484, "eval_runtime": 69.5513, "eval_samples_per_second": 7.189, "eval_steps_per_second": 0.906, "step": 600 }, { "epoch": 0.751868114937216, "grad_norm": 1.5376946639446236, "learning_rate": 1.6151702708036837e-06, "loss": 0.7508, "step": 610 }, { "epoch": 0.7641938217394654, "grad_norm": 2.3042535835655324, "learning_rate": 1.4680694866545708e-06, "loss": 0.7494, "step": 620 }, { "epoch": 0.7765195285417148, "grad_norm": 2.5286785030275896, "learning_rate": 1.3268320151230518e-06, "loss": 0.7491, "step": 630 }, { "epoch": 0.7888452353439642, "grad_norm": 1.4854931787213466, "learning_rate": 1.1916923227270228e-06, "loss": 0.7425, "step": 640 }, { "epoch": 0.8011709421462136, "grad_norm": 1.5522255076883127, "learning_rate": 1.0628747531393202e-06, "loss": 0.7378, "step": 650 }, { "epoch": 0.8011709421462136, "eval_loss": 2.834235668182373, "eval_runtime": 69.1095, "eval_samples_per_second": 7.235, "eval_steps_per_second": 0.912, "step": 650 }, { "epoch": 0.8134966489484632, "grad_norm": 1.9794301258828817, "learning_rate": 9.405931547576591e-07, "loss": 0.7423, "step": 660 }, { "epoch": 0.8258223557507126, "grad_norm": 1.6396359308051802, "learning_rate": 8.250505256976565e-07, "loss": 0.7403, "step": 670 }, { "epoch": 0.838148062552962, "grad_norm": 1.4906580462063472, "learning_rate": 7.164386767982523e-07, "loss": 0.7429, "step": 680 }, { "epoch": 0.8504737693552115, "grad_norm": 1.5611142552447048, "learning_rate": 6.14937913198988e-07, "loss": 0.7429, "step": 690 }, { "epoch": 0.8627994761574609, "grad_norm": 1.7252809241954563, "learning_rate": 5.207167350177639e-07, "loss": 0.7358, "step": 700 }, { "epoch": 0.8627994761574609, "eval_loss": 2.906477689743042, "eval_runtime": 68.7423, "eval_samples_per_second": 7.274, "eval_steps_per_second": 0.916, "step": 700 }, { "epoch": 0.8751251829597103, "grad_norm": 1.5433767313992226, "learning_rate": 4.3393155762594343e-07, "loss": 0.7254, "step": 710 }, { "epoch": 0.8874508897619597, "grad_norm": 1.61172583739597, "learning_rate": 3.5472645198520064e-07, "loss": 0.7355, "step": 720 }, { "epoch": 0.8997765965642093, "grad_norm": 1.5245210157154765, "learning_rate": 2.832329054771593e-07, "loss": 0.7354, "step": 730 }, { "epoch": 0.9121023033664587, "grad_norm": 1.5213369145888322, "learning_rate": 2.1956960362287726e-07, "loss": 0.7374, "step": 740 }, { "epoch": 0.9244280101687081, "grad_norm": 1.6540245112281797, "learning_rate": 1.6384223305453417e-07, "loss": 0.7255, "step": 750 }, { "epoch": 0.9244280101687081, "eval_loss": 2.8892390727996826, "eval_runtime": 68.7732, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.916, "step": 750 }, { "epoch": 0.9367537169709576, "grad_norm": 1.5276205250223267, "learning_rate": 1.1614330606639912e-07, "loss": 0.7323, "step": 760 }, { "epoch": 0.949079423773207, "grad_norm": 1.6893602703459814, "learning_rate": 7.655200703636623e-08, "loss": 0.7315, "step": 770 }, { "epoch": 0.9614051305754564, "grad_norm": 1.501672320168789, "learning_rate": 4.513406097297224e-08, "loss": 0.7233, "step": 780 }, { "epoch": 0.9737308373777058, "grad_norm": 1.6616486025520198, "learning_rate": 2.194162440616099e-08, "loss": 0.7226, "step": 790 }, { "epoch": 0.9860565441799554, "grad_norm": 1.81257795155879, "learning_rate": 7.01319880288931e-09, "loss": 0.7365, "step": 800 }, { "epoch": 0.9860565441799554, "eval_loss": 2.904540777206421, "eval_runtime": 68.6952, "eval_samples_per_second": 7.279, "eval_steps_per_second": 0.917, "step": 800 } ], "logging_steps": 10, "max_steps": 812, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2177974140928000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }