{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.066477070742979, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017774617845716316, "grad_norm": 0.35813066363334656, "learning_rate": 4.3516873889875667e-07, "loss": 2.0692, "mean_token_accuracy": 0.6023900508880615, "num_tokens": 204800.0, "step": 50 }, { "epoch": 0.03554923569143263, "grad_norm": 0.32111021876335144, "learning_rate": 8.792184724689167e-07, "loss": 2.0543, "mean_token_accuracy": 0.6035288572311401, "num_tokens": 409600.0, "step": 100 }, { "epoch": 0.053323853537148955, "grad_norm": 0.32630783319473267, "learning_rate": 1.3232682060390764e-06, "loss": 2.0209, "mean_token_accuracy": 0.6055229902267456, "num_tokens": 614400.0, "step": 150 }, { "epoch": 0.07109847138286526, "grad_norm": 0.28461402654647827, "learning_rate": 1.7673179396092362e-06, "loss": 1.9593, "mean_token_accuracy": 0.6079961061477661, "num_tokens": 819200.0, "step": 200 }, { "epoch": 0.08887308922858159, "grad_norm": 0.26054471731185913, "learning_rate": 2.211367673179396e-06, "loss": 1.856, "mean_token_accuracy": 0.619281530380249, "num_tokens": 1024000.0, "step": 250 }, { "epoch": 0.10664770707429791, "grad_norm": 0.2539742588996887, "learning_rate": 2.6554174067495562e-06, "loss": 1.7108, "mean_token_accuracy": 0.6380449533462524, "num_tokens": 1228800.0, "step": 300 }, { "epoch": 0.12442232492001422, "grad_norm": 0.2737816274166107, "learning_rate": 3.0994671403197163e-06, "loss": 1.5271, "mean_token_accuracy": 0.6642766416072845, "num_tokens": 1433600.0, "step": 350 }, { "epoch": 0.14219694276573053, "grad_norm": 0.4782876968383789, "learning_rate": 3.543516873889876e-06, "loss": 1.2194, "mean_token_accuracy": 0.7056109464168548, "num_tokens": 1638400.0, "step": 400 }, { "epoch": 0.15997156061144685, "grad_norm": 0.33332306146621704, "learning_rate": 3.987566607460036e-06, "loss": 0.5046, "mean_token_accuracy": 0.8682502424716949, "num_tokens": 1843200.0, "step": 450 }, { "epoch": 0.17774617845716317, "grad_norm": 0.07436951249837875, "learning_rate": 4.431616341030196e-06, "loss": 0.0607, "mean_token_accuracy": 0.9876881837844849, "num_tokens": 2048000.0, "step": 500 }, { "epoch": 0.1955207963028795, "grad_norm": 0.0204151701182127, "learning_rate": 4.875666074600356e-06, "loss": 0.0215, "mean_token_accuracy": 0.9962121248245239, "num_tokens": 2252800.0, "step": 550 }, { "epoch": 0.21329541414859582, "grad_norm": 0.02164212055504322, "learning_rate": 4.999376292672713e-06, "loss": 0.0167, "mean_token_accuracy": 0.9970674514770508, "num_tokens": 2457600.0, "step": 600 }, { "epoch": 0.23107003199431211, "grad_norm": 0.020050790160894394, "learning_rate": 4.996441329770864e-06, "loss": 0.0139, "mean_token_accuracy": 0.9980645179748535, "num_tokens": 2662400.0, "step": 650 }, { "epoch": 0.24884464984002844, "grad_norm": 0.01833987981081009, "learning_rate": 4.991103606682987e-06, "loss": 0.0114, "mean_token_accuracy": 0.9990224838256836, "num_tokens": 2867200.0, "step": 700 }, { "epoch": 0.26661926768574473, "grad_norm": 0.017948873341083527, "learning_rate": 4.983368260829447e-06, "loss": 0.0097, "mean_token_accuracy": 0.9990224838256836, "num_tokens": 3072000.0, "step": 750 }, { "epoch": 0.28439388553146105, "grad_norm": 0.016040902584791183, "learning_rate": 4.973242737280479e-06, "loss": 0.0088, "mean_token_accuracy": 0.9990224838256836, "num_tokens": 3276800.0, "step": 800 }, { "epoch": 0.3021685033771774, "grad_norm": 0.019489038735628128, "learning_rate": 4.9607367815905074e-06, "loss": 0.0077, "mean_token_accuracy": 0.9990224838256836, "num_tokens": 3481600.0, "step": 850 }, { "epoch": 0.3199431212228937, "grad_norm": 0.023746857419610023, "learning_rate": 4.9458624304183e-06, "loss": 0.0066, "mean_token_accuracy": 0.9990224838256836, "num_tokens": 3686400.0, "step": 900 }, { "epoch": 0.33771773906861, "grad_norm": 0.02442137524485588, "learning_rate": 4.928633999941995e-06, "loss": 0.0052, "mean_token_accuracy": 0.9994232654571533, "num_tokens": 3891200.0, "step": 950 }, { "epoch": 0.35549235691432635, "grad_norm": 0.024948744103312492, "learning_rate": 4.909068072080152e-06, "loss": 0.0033, "mean_token_accuracy": 0.9999169111251831, "num_tokens": 4096000.0, "step": 1000 }, { "epoch": 0.37326697476004267, "grad_norm": 0.010364473797380924, "learning_rate": 4.887183478532081e-06, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 4300800.0, "step": 1050 }, { "epoch": 0.391041592605759, "grad_norm": 0.004597879946231842, "learning_rate": 4.86300128265282e-06, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 4505600.0, "step": 1100 }, { "epoch": 0.4088162104514753, "grad_norm": 0.0024459899868816137, "learning_rate": 4.836544759180206e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 4710400.0, "step": 1150 }, { "epoch": 0.42659082829719164, "grad_norm": 0.0013584563275799155, "learning_rate": 4.807839371833534e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 4915200.0, "step": 1200 }, { "epoch": 0.4443654461429079, "grad_norm": 0.0022760790307074785, "learning_rate": 4.776912748805392e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 5120000.0, "step": 1250 }, { "epoch": 0.46214006398862423, "grad_norm": 0.0010678465478122234, "learning_rate": 4.743794656170235e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 5324800.0, "step": 1300 }, { "epoch": 0.47991468183434055, "grad_norm": 0.0008997164550237358, "learning_rate": 4.7085169692353175e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 5529600.0, "step": 1350 }, { "epoch": 0.4976892996800569, "grad_norm": 0.0007172881159931421, "learning_rate": 4.671113641861523e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 5734400.0, "step": 1400 }, { "epoch": 0.5154639175257731, "grad_norm": 0.0006771351909264922, "learning_rate": 4.631620673783662e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 5939200.0, "step": 1450 }, { "epoch": 0.5332385353714895, "grad_norm": 0.0018679159693419933, "learning_rate": 4.590076075961653e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 6144000.0, "step": 1500 }, { "epoch": 0.5510131532172058, "grad_norm": 0.0005585107719525695, "learning_rate": 4.546519833995959e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 6348800.0, "step": 1550 }, { "epoch": 0.5687877710629221, "grad_norm": 0.0008035791106522083, "learning_rate": 4.5009938696424816e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 6553600.0, "step": 1600 }, { "epoch": 0.5865623889086384, "grad_norm": 0.0005080197588540614, "learning_rate": 4.453542000463954e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 6758400.0, "step": 1650 }, { "epoch": 0.6043370067543548, "grad_norm": 0.0005295136361382902, "learning_rate": 4.404209897656672e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 6963200.0, "step": 1700 }, { "epoch": 0.6221116246000711, "grad_norm": 0.0004332393582444638, "learning_rate": 4.353045042093153e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 7168000.0, "step": 1750 }, { "epoch": 0.6398862424457874, "grad_norm": 0.0006319705280475318, "learning_rate": 4.3000966786230145e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 7372800.0, "step": 1800 }, { "epoch": 0.6576608602915037, "grad_norm": 0.0003928539517801255, "learning_rate": 4.245415768676092e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 7577600.0, "step": 1850 }, { "epoch": 0.67543547813722, "grad_norm": 0.00046053010737523437, "learning_rate": 4.189054941213374e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 7782400.0, "step": 1900 }, { "epoch": 0.6932100959829364, "grad_norm": 0.0005214207340031862, "learning_rate": 4.131068442072993e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 7987200.0, "step": 1950 }, { "epoch": 0.7109847138286527, "grad_norm": 0.00034946095547638834, "learning_rate": 4.071512081760004e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 8192000.0, "step": 2000 }, { "epoch": 0.728759331674369, "grad_norm": 0.0003215207834728062, "learning_rate": 4.010443181730227e-06, "loss": 0.0001, "mean_token_accuracy": 1.0, "num_tokens": 8396800.0, "step": 2050 }, { "epoch": 0.7465339495200853, "grad_norm": 0.000304091430734843, "learning_rate": 3.947920519219819e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 8601600.0, "step": 2100 }, { "epoch": 0.7643085673658017, "grad_norm": 0.0002895969955716282, "learning_rate": 3.884004270673711e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 8806400.0, "step": 2150 }, { "epoch": 0.782083185211518, "grad_norm": 0.000491043902002275, "learning_rate": 3.8187559538273364e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 9011200.0, "step": 2200 }, { "epoch": 0.7998578030572343, "grad_norm": 0.0006208749837242067, "learning_rate": 3.7522383684973994e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 9216000.0, "step": 2250 }, { "epoch": 0.8176324209029506, "grad_norm": 0.0002920223632827401, "learning_rate": 3.684515536138678e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 9420800.0, "step": 2300 }, { "epoch": 0.835407038748667, "grad_norm": 0.0002676283475011587, "learning_rate": 3.615652638225031e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 9625600.0, "step": 2350 }, { "epoch": 0.8531816565943833, "grad_norm": 0.00026544969296082854, "learning_rate": 3.5457159535139164e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 9830400.0, "step": 2400 }, { "epoch": 0.8709562744400995, "grad_norm": 0.0002422648831270635, "learning_rate": 3.474772794254798e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 10035200.0, "step": 2450 }, { "epoch": 0.8887308922858158, "grad_norm": 0.00023040748783387244, "learning_rate": 3.4028914414028546e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 10240000.0, "step": 2500 }, { "epoch": 0.9065055101315321, "grad_norm": 0.00022141206136438996, "learning_rate": 3.3301410789003196e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 10444800.0, "step": 2550 }, { "epoch": 0.9242801279772485, "grad_norm": 0.00022032092965673655, "learning_rate": 3.256591727088731e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 10649600.0, "step": 2600 }, { "epoch": 0.9420547458229648, "grad_norm": 0.00034698573290370405, "learning_rate": 3.182314175316163e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 10854400.0, "step": 2650 }, { "epoch": 0.9598293636686811, "grad_norm": 0.0002007318107644096, "learning_rate": 3.1073799138043115e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 11059200.0, "step": 2700 }, { "epoch": 0.9776039815143974, "grad_norm": 0.0008882895344868302, "learning_rate": 3.0318610648410037e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 11264000.0, "step": 2750 }, { "epoch": 0.9953785993601137, "grad_norm": 0.00027154191047884524, "learning_rate": 2.955830313364363e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 11468800.0, "step": 2800 }, { "epoch": 1.0131532172058302, "grad_norm": 0.00027565049822442234, "learning_rate": 2.879360837005437e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 11671552.0, "step": 2850 }, { "epoch": 1.0309278350515463, "grad_norm": 0.00022640528914052993, "learning_rate": 2.80252623565662e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 11876352.0, "step": 2900 }, { "epoch": 1.0487024528972626, "grad_norm": 0.00019061467901337892, "learning_rate": 2.725400460633664e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 12081152.0, "step": 2950 }, { "epoch": 1.066477070742979, "grad_norm": 0.00017320447659585625, "learning_rate": 2.648057743499445e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 12285952.0, "step": 3000 } ], "logging_steps": 50, "max_steps": 5626, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.643453362570527e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }