{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 10, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.046511627906976744, "grad_norm": 3.0025534629821777, "learning_rate": 0.0, "loss": 4.5637, "step": 1 }, { "epoch": 0.09302325581395349, "grad_norm": 3.104435920715332, "learning_rate": 2.857142857142857e-05, "loss": 4.627, "step": 2 }, { "epoch": 0.13953488372093023, "grad_norm": 2.6059792041778564, "learning_rate": 5.714285714285714e-05, "loss": 4.3333, "step": 3 }, { "epoch": 0.18604651162790697, "grad_norm": 2.0599193572998047, "learning_rate": 8.571428571428571e-05, "loss": 4.1341, "step": 4 }, { "epoch": 0.23255813953488372, "grad_norm": 1.8155323266983032, "learning_rate": 0.00011428571428571428, "loss": 3.8432, "step": 5 }, { "epoch": 0.27906976744186046, "grad_norm": 1.708072304725647, "learning_rate": 0.00014285714285714287, "loss": 3.5746, "step": 6 }, { "epoch": 0.32558139534883723, "grad_norm": 1.8196609020233154, "learning_rate": 0.00017142857142857143, "loss": 3.2307, "step": 7 }, { "epoch": 0.37209302325581395, "grad_norm": 2.088304281234741, "learning_rate": 0.0002, "loss": 2.841, "step": 8 }, { "epoch": 0.4186046511627907, "grad_norm": 1.9300062656402588, "learning_rate": 0.0001998582695676762, "loss": 2.3523, "step": 9 }, { "epoch": 0.46511627906976744, "grad_norm": 1.7693432569503784, "learning_rate": 0.00019943348002101371, "loss": 2.0571, "step": 10 }, { "epoch": 0.46511627906976744, "eval_loss": 1.9185459613800049, "eval_runtime": 146.224, "eval_samples_per_second": 33.428, "eval_steps_per_second": 0.137, "step": 10 }, { "epoch": 0.5116279069767442, "grad_norm": 2.0438809394836426, "learning_rate": 0.00019872683547213446, "loss": 1.8194, "step": 11 }, { "epoch": 0.5581395348837209, "grad_norm": 1.2566792964935303, "learning_rate": 0.00019774033898178667, "loss": 1.6177, "step": 12 }, { "epoch": 0.6046511627906976, "grad_norm": 1.0408961772918701, "learning_rate": 0.0001964767868814516, "loss": 1.4482, "step": 13 }, { "epoch": 0.6511627906976745, "grad_norm": 1.4391571283340454, "learning_rate": 0.00019493976084683813, "loss": 1.3241, "step": 14 }, { "epoch": 0.6976744186046512, "grad_norm": 1.833454966545105, "learning_rate": 0.00019313361774523385, "loss": 1.1903, "step": 15 }, { "epoch": 0.7441860465116279, "grad_norm": 1.9543718099594116, "learning_rate": 0.00019106347728549135, "loss": 1.0326, "step": 16 }, { "epoch": 0.7906976744186046, "grad_norm": 1.6850959062576294, "learning_rate": 0.00018873520750565718, "loss": 0.8707, "step": 17 }, { "epoch": 0.8372093023255814, "grad_norm": 1.4347357749938965, "learning_rate": 0.0001861554081393806, "loss": 0.6915, "step": 18 }, { "epoch": 0.8837209302325582, "grad_norm": 1.2694600820541382, "learning_rate": 0.0001833313919082515, "loss": 0.516, "step": 19 }, { "epoch": 0.9302325581395349, "grad_norm": 1.1048444509506226, "learning_rate": 0.00018027116379309638, "loss": 0.355, "step": 20 }, { "epoch": 0.9302325581395349, "eval_loss": 0.23854584991931915, "eval_runtime": 145.2399, "eval_samples_per_second": 33.655, "eval_steps_per_second": 0.138, "step": 20 }, { "epoch": 0.9767441860465116, "grad_norm": 0.9502027630805969, "learning_rate": 0.00017698339834299061, "loss": 0.2293, "step": 21 }, { "epoch": 1.0, "grad_norm": 0.6736359000205994, "learning_rate": 0.00017347741508630672, "loss": 0.1293, "step": 22 }, { "epoch": 1.0465116279069768, "grad_norm": 0.34549638628959656, "learning_rate": 0.0001697631521134985, "loss": 0.0697, "step": 23 }, { "epoch": 1.0930232558139534, "grad_norm": 0.173061802983284, "learning_rate": 0.00016585113790650388, "loss": 0.0501, "step": 24 }, { "epoch": 1.1395348837209303, "grad_norm": 0.12201997637748718, "learning_rate": 0.0001617524614946192, "loss": 0.0345, "step": 25 }, { "epoch": 1.1860465116279069, "grad_norm": 0.1040547639131546, "learning_rate": 0.0001574787410214407, "loss": 0.0366, "step": 26 }, { "epoch": 1.2325581395348837, "grad_norm": 0.10368712991476059, "learning_rate": 0.00015304209081197425, "loss": 0.0332, "step": 27 }, { "epoch": 1.2790697674418605, "grad_norm": 0.08539443463087082, "learning_rate": 0.00014845508703326504, "loss": 0.0314, "step": 28 }, { "epoch": 1.3255813953488373, "grad_norm": 0.06456629931926727, "learning_rate": 0.00014373073204588556, "loss": 0.0288, "step": 29 }, { "epoch": 1.372093023255814, "grad_norm": 0.04876594990491867, "learning_rate": 0.00013888241754733208, "loss": 0.0285, "step": 30 }, { "epoch": 1.372093023255814, "eval_loss": 0.03179961442947388, "eval_runtime": 145.5943, "eval_samples_per_second": 33.573, "eval_steps_per_second": 0.137, "step": 30 }, { "epoch": 1.4186046511627908, "grad_norm": 0.04562387242913246, "learning_rate": 0.00013392388661180303, "loss": 0.0287, "step": 31 }, { "epoch": 1.4651162790697674, "grad_norm": 0.05053865909576416, "learning_rate": 0.0001288691947339621, "loss": 0.0268, "step": 32 }, { "epoch": 1.5116279069767442, "grad_norm": 0.0430695004761219, "learning_rate": 0.0001237326699871115, "loss": 0.0271, "step": 33 }, { "epoch": 1.558139534883721, "grad_norm": 0.03730069100856781, "learning_rate": 0.00011852887240871145, "loss": 0.027, "step": 34 }, { "epoch": 1.6046511627906976, "grad_norm": 0.03784715384244919, "learning_rate": 0.00011327255272837221, "loss": 0.0277, "step": 35 }, { "epoch": 1.6511627906976745, "grad_norm": 0.08213754743337631, "learning_rate": 0.00010797861055530831, "loss": 0.0264, "step": 36 }, { "epoch": 1.697674418604651, "grad_norm": 0.043709009885787964, "learning_rate": 0.00010266205214377748, "loss": 0.0277, "step": 37 }, { "epoch": 1.744186046511628, "grad_norm": 0.030792105942964554, "learning_rate": 9.733794785622253e-05, "loss": 0.0253, "step": 38 }, { "epoch": 1.7906976744186047, "grad_norm": 0.03732023760676384, "learning_rate": 9.202138944469168e-05, "loss": 0.024, "step": 39 }, { "epoch": 1.8372093023255816, "grad_norm": 0.035537052899599075, "learning_rate": 8.672744727162781e-05, "loss": 0.0236, "step": 40 }, { "epoch": 1.8372093023255816, "eval_loss": 0.028443168848752975, "eval_runtime": 144.7893, "eval_samples_per_second": 33.759, "eval_steps_per_second": 0.138, "step": 40 }, { "epoch": 1.8837209302325582, "grad_norm": 0.032163042575120926, "learning_rate": 8.147112759128859e-05, "loss": 0.0239, "step": 41 }, { "epoch": 1.9302325581395348, "grad_norm": 0.036109376698732376, "learning_rate": 7.626733001288851e-05, "loss": 0.0233, "step": 42 }, { "epoch": 1.9767441860465116, "grad_norm": 0.04403885081410408, "learning_rate": 7.113080526603792e-05, "loss": 0.0249, "step": 43 }, { "epoch": 2.0, "grad_norm": 0.06689224392175674, "learning_rate": 6.607611338819697e-05, "loss": 0.0275, "step": 44 }, { "epoch": 2.046511627906977, "grad_norm": 0.03163151443004608, "learning_rate": 6.111758245266794e-05, "loss": 0.0223, "step": 45 }, { "epoch": 2.0930232558139537, "grad_norm": 0.024843823164701462, "learning_rate": 5.626926795411447e-05, "loss": 0.022, "step": 46 }, { "epoch": 2.13953488372093, "grad_norm": 0.0263860821723938, "learning_rate": 5.1544912966734994e-05, "loss": 0.0213, "step": 47 }, { "epoch": 2.186046511627907, "grad_norm": 0.02427072636783123, "learning_rate": 4.695790918802576e-05, "loss": 0.022, "step": 48 }, { "epoch": 2.2325581395348837, "grad_norm": 0.037542153149843216, "learning_rate": 4.252125897855932e-05, "loss": 0.0192, "step": 49 }, { "epoch": 2.2790697674418605, "grad_norm": 0.023852059617638588, "learning_rate": 3.824753850538082e-05, "loss": 0.0214, "step": 50 }, { "epoch": 2.2790697674418605, "eval_loss": 0.027282487601041794, "eval_runtime": 145.0463, "eval_samples_per_second": 33.7, "eval_steps_per_second": 0.138, "step": 50 }, { "epoch": 2.3255813953488373, "grad_norm": 0.029498351737856865, "learning_rate": 3.414886209349615e-05, "loss": 0.0208, "step": 51 }, { "epoch": 2.3720930232558137, "grad_norm": 0.02536361664533615, "learning_rate": 3.0236847886501542e-05, "loss": 0.0208, "step": 52 }, { "epoch": 2.4186046511627906, "grad_norm": 0.031641166657209396, "learning_rate": 2.6522584913693294e-05, "loss": 0.0222, "step": 53 }, { "epoch": 2.4651162790697674, "grad_norm": 0.027137573808431625, "learning_rate": 2.301660165700936e-05, "loss": 0.0204, "step": 54 }, { "epoch": 2.511627906976744, "grad_norm": 0.024312356486916542, "learning_rate": 1.9728836206903656e-05, "loss": 0.0189, "step": 55 }, { "epoch": 2.558139534883721, "grad_norm": 0.023400593549013138, "learning_rate": 1.6668608091748495e-05, "loss": 0.0209, "step": 56 }, { "epoch": 2.604651162790698, "grad_norm": 0.0216949712485075, "learning_rate": 1.3844591860619383e-05, "loss": 0.0209, "step": 57 }, { "epoch": 2.6511627906976747, "grad_norm": 0.022243639454245567, "learning_rate": 1.1264792494342857e-05, "loss": 0.0199, "step": 58 }, { "epoch": 2.697674418604651, "grad_norm": 0.02433605305850506, "learning_rate": 8.936522714508678e-06, "loss": 0.021, "step": 59 }, { "epoch": 2.744186046511628, "grad_norm": 0.029319288209080696, "learning_rate": 6.866382254766157e-06, "loss": 0.0203, "step": 60 }, { "epoch": 2.744186046511628, "eval_loss": 0.027196675539016724, "eval_runtime": 144.7292, "eval_samples_per_second": 33.773, "eval_steps_per_second": 0.138, "step": 60 }, { "epoch": 2.7906976744186047, "grad_norm": 0.02521028183400631, "learning_rate": 5.060239153161872e-06, "loss": 0.0201, "step": 61 }, { "epoch": 2.8372093023255816, "grad_norm": 0.025171076878905296, "learning_rate": 3.5232131185484076e-06, "loss": 0.0197, "step": 62 }, { "epoch": 2.883720930232558, "grad_norm": 0.030205056071281433, "learning_rate": 2.259661018213333e-06, "loss": 0.0223, "step": 63 }, { "epoch": 2.9302325581395348, "grad_norm": 0.026074590161442757, "learning_rate": 1.2731645278655445e-06, "loss": 0.0183, "step": 64 }, { "epoch": 2.9767441860465116, "grad_norm": 0.025404850021004677, "learning_rate": 5.665199789862907e-07, "loss": 0.02, "step": 65 }, { "epoch": 3.0, "grad_norm": 0.03129518777132034, "learning_rate": 1.4173043232380557e-07, "loss": 0.0211, "step": 66 }, { "epoch": 3.0, "step": 66, "total_flos": 4.430124994772599e+18, "train_loss": 0.725957691979905, "train_runtime": 13355.9166, "train_samples_per_second": 9.881, "train_steps_per_second": 0.005 } ], "logging_steps": 1.0, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.430124994772599e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }