{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.081632653061225, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04081632653061224, "grad_norm": 7.114395618438721, "learning_rate": 9.981632653061225e-06, "loss": 0.7362, "step": 10 }, { "epoch": 0.08163265306122448, "grad_norm": 11.572301864624023, "learning_rate": 9.961224489795919e-06, "loss": 0.8729, "step": 20 }, { "epoch": 0.12244897959183673, "grad_norm": 9.383491516113281, "learning_rate": 9.940816326530614e-06, "loss": 0.773, "step": 30 }, { "epoch": 0.16326530612244897, "grad_norm": 7.83120059967041, "learning_rate": 9.920408163265307e-06, "loss": 0.7817, "step": 40 }, { "epoch": 0.20408163265306123, "grad_norm": 10.92087173461914, "learning_rate": 9.9e-06, "loss": 0.6256, "step": 50 }, { "epoch": 0.24489795918367346, "grad_norm": 3.8826725482940674, "learning_rate": 9.879591836734695e-06, "loss": 0.5759, "step": 60 }, { "epoch": 0.2857142857142857, "grad_norm": 11.15483283996582, "learning_rate": 9.859183673469388e-06, "loss": 0.7333, "step": 70 }, { "epoch": 0.32653061224489793, "grad_norm": 11.470726013183594, "learning_rate": 9.838775510204083e-06, "loss": 0.5943, "step": 80 }, { "epoch": 0.3673469387755102, "grad_norm": 13.159674644470215, "learning_rate": 9.818367346938777e-06, "loss": 0.7804, "step": 90 }, { "epoch": 0.40816326530612246, "grad_norm": 9.58558464050293, "learning_rate": 9.79795918367347e-06, "loss": 0.6491, "step": 100 }, { "epoch": 0.4489795918367347, "grad_norm": 9.653897285461426, "learning_rate": 9.777551020408163e-06, "loss": 0.5919, "step": 110 }, { "epoch": 0.4897959183673469, "grad_norm": 8.117432594299316, "learning_rate": 9.757142857142858e-06, "loss": 0.4571, "step": 120 }, { "epoch": 0.5306122448979592, "grad_norm": 6.9328460693359375, "learning_rate": 9.736734693877551e-06, "loss": 0.6597, "step": 130 }, { "epoch": 0.5714285714285714, "grad_norm": 7.962501049041748, "learning_rate": 9.716326530612246e-06, "loss": 0.5132, "step": 140 }, { "epoch": 0.6122448979591837, "grad_norm": 10.508763313293457, "learning_rate": 9.69591836734694e-06, "loss": 0.6893, "step": 150 }, { "epoch": 0.6530612244897959, "grad_norm": 7.637253761291504, "learning_rate": 9.675510204081635e-06, "loss": 0.6142, "step": 160 }, { "epoch": 0.6938775510204082, "grad_norm": 10.0332670211792, "learning_rate": 9.655102040816328e-06, "loss": 0.582, "step": 170 }, { "epoch": 0.7346938775510204, "grad_norm": 8.150875091552734, "learning_rate": 9.634693877551021e-06, "loss": 0.477, "step": 180 }, { "epoch": 0.7755102040816326, "grad_norm": 10.330913543701172, "learning_rate": 9.614285714285714e-06, "loss": 0.5916, "step": 190 }, { "epoch": 0.8163265306122449, "grad_norm": 11.654999732971191, "learning_rate": 9.593877551020408e-06, "loss": 0.6236, "step": 200 }, { "epoch": 0.8571428571428571, "grad_norm": 8.048078536987305, "learning_rate": 9.573469387755103e-06, "loss": 0.6142, "step": 210 }, { "epoch": 0.8979591836734694, "grad_norm": 9.869592666625977, "learning_rate": 9.553061224489798e-06, "loss": 0.625, "step": 220 }, { "epoch": 0.9387755102040817, "grad_norm": 8.321409225463867, "learning_rate": 9.532653061224491e-06, "loss": 0.5767, "step": 230 }, { "epoch": 0.9795918367346939, "grad_norm": 7.6769256591796875, "learning_rate": 9.512244897959184e-06, "loss": 0.5134, "step": 240 }, { "epoch": 1.0204081632653061, "grad_norm": 9.609123229980469, "learning_rate": 9.491836734693877e-06, "loss": 0.5868, "step": 250 }, { "epoch": 1.0612244897959184, "grad_norm": 9.19683837890625, "learning_rate": 9.471428571428572e-06, "loss": 0.5215, "step": 260 }, { "epoch": 1.1020408163265305, "grad_norm": 7.328164577484131, "learning_rate": 9.451020408163266e-06, "loss": 0.5422, "step": 270 }, { "epoch": 1.1428571428571428, "grad_norm": 6.913904190063477, "learning_rate": 9.430612244897959e-06, "loss": 0.5214, "step": 280 }, { "epoch": 1.183673469387755, "grad_norm": 9.28811264038086, "learning_rate": 9.410204081632654e-06, "loss": 0.5319, "step": 290 }, { "epoch": 1.2244897959183674, "grad_norm": 9.132966041564941, "learning_rate": 9.389795918367349e-06, "loss": 0.5581, "step": 300 }, { "epoch": 1.2653061224489797, "grad_norm": 6.9722065925598145, "learning_rate": 9.369387755102042e-06, "loss": 0.431, "step": 310 }, { "epoch": 1.306122448979592, "grad_norm": 5.06177282333374, "learning_rate": 9.348979591836736e-06, "loss": 0.4583, "step": 320 }, { "epoch": 1.346938775510204, "grad_norm": 7.732840538024902, "learning_rate": 9.328571428571429e-06, "loss": 0.4194, "step": 330 }, { "epoch": 1.3877551020408163, "grad_norm": 8.94101333618164, "learning_rate": 9.308163265306122e-06, "loss": 0.4519, "step": 340 }, { "epoch": 1.4285714285714286, "grad_norm": 7.5437750816345215, "learning_rate": 9.287755102040817e-06, "loss": 0.5095, "step": 350 }, { "epoch": 1.469387755102041, "grad_norm": 5.702700138092041, "learning_rate": 9.26734693877551e-06, "loss": 0.3936, "step": 360 }, { "epoch": 1.510204081632653, "grad_norm": 9.153871536254883, "learning_rate": 9.246938775510205e-06, "loss": 0.4566, "step": 370 }, { "epoch": 1.5510204081632653, "grad_norm": 13.249794006347656, "learning_rate": 9.226530612244899e-06, "loss": 0.5216, "step": 380 }, { "epoch": 1.5918367346938775, "grad_norm": 7.065913200378418, "learning_rate": 9.206122448979594e-06, "loss": 0.4562, "step": 390 }, { "epoch": 1.6326530612244898, "grad_norm": 7.559301853179932, "learning_rate": 9.185714285714287e-06, "loss": 0.3883, "step": 400 }, { "epoch": 1.6734693877551021, "grad_norm": 12.103629112243652, "learning_rate": 9.16530612244898e-06, "loss": 0.4149, "step": 410 }, { "epoch": 1.7142857142857144, "grad_norm": 7.9720072746276855, "learning_rate": 9.144897959183673e-06, "loss": 0.4718, "step": 420 }, { "epoch": 1.7551020408163265, "grad_norm": 4.845782279968262, "learning_rate": 9.124489795918368e-06, "loss": 0.4304, "step": 430 }, { "epoch": 1.7959183673469388, "grad_norm": 6.954368591308594, "learning_rate": 9.104081632653062e-06, "loss": 0.3436, "step": 440 }, { "epoch": 1.836734693877551, "grad_norm": 4.751299858093262, "learning_rate": 9.083673469387757e-06, "loss": 0.4366, "step": 450 }, { "epoch": 1.8775510204081631, "grad_norm": 6.507364273071289, "learning_rate": 9.06326530612245e-06, "loss": 0.5794, "step": 460 }, { "epoch": 1.9183673469387754, "grad_norm": 8.891802787780762, "learning_rate": 9.042857142857143e-06, "loss": 0.4616, "step": 470 }, { "epoch": 1.9591836734693877, "grad_norm": 10.056327819824219, "learning_rate": 9.022448979591838e-06, "loss": 0.4946, "step": 480 }, { "epoch": 2.0, "grad_norm": 7.899660110473633, "learning_rate": 9.002040816326531e-06, "loss": 0.4437, "step": 490 }, { "epoch": 2.0408163265306123, "grad_norm": 6.761326313018799, "learning_rate": 8.981632653061225e-06, "loss": 0.4303, "step": 500 }, { "epoch": 2.0816326530612246, "grad_norm": 8.639615058898926, "learning_rate": 8.96122448979592e-06, "loss": 0.3267, "step": 510 }, { "epoch": 2.122448979591837, "grad_norm": 7.710758209228516, "learning_rate": 8.940816326530613e-06, "loss": 0.3559, "step": 520 }, { "epoch": 2.163265306122449, "grad_norm": 6.812905311584473, "learning_rate": 8.920408163265308e-06, "loss": 0.4761, "step": 530 }, { "epoch": 2.204081632653061, "grad_norm": 7.2431511878967285, "learning_rate": 8.900000000000001e-06, "loss": 0.405, "step": 540 }, { "epoch": 2.2448979591836733, "grad_norm": 7.230724811553955, "learning_rate": 8.879591836734694e-06, "loss": 0.3638, "step": 550 }, { "epoch": 2.2857142857142856, "grad_norm": 9.520208358764648, "learning_rate": 8.859183673469388e-06, "loss": 0.3473, "step": 560 }, { "epoch": 2.326530612244898, "grad_norm": 7.048585414886475, "learning_rate": 8.838775510204083e-06, "loss": 0.3652, "step": 570 }, { "epoch": 2.36734693877551, "grad_norm": 6.979404449462891, "learning_rate": 8.818367346938776e-06, "loss": 0.3855, "step": 580 }, { "epoch": 2.4081632653061225, "grad_norm": 3.765305280685425, "learning_rate": 8.797959183673471e-06, "loss": 0.3452, "step": 590 }, { "epoch": 2.4489795918367347, "grad_norm": 10.533697128295898, "learning_rate": 8.777551020408164e-06, "loss": 0.3874, "step": 600 }, { "epoch": 2.489795918367347, "grad_norm": 8.108145713806152, "learning_rate": 8.757142857142858e-06, "loss": 0.3695, "step": 610 }, { "epoch": 2.5306122448979593, "grad_norm": 7.947360992431641, "learning_rate": 8.736734693877552e-06, "loss": 0.408, "step": 620 }, { "epoch": 2.571428571428571, "grad_norm": 7.8081374168396, "learning_rate": 8.716326530612246e-06, "loss": 0.4059, "step": 630 }, { "epoch": 2.612244897959184, "grad_norm": 8.579155921936035, "learning_rate": 8.695918367346939e-06, "loss": 0.3934, "step": 640 }, { "epoch": 2.6530612244897958, "grad_norm": 6.4387712478637695, "learning_rate": 8.675510204081632e-06, "loss": 0.4256, "step": 650 }, { "epoch": 2.693877551020408, "grad_norm": 8.415692329406738, "learning_rate": 8.655102040816327e-06, "loss": 0.3453, "step": 660 }, { "epoch": 2.7346938775510203, "grad_norm": 8.50904369354248, "learning_rate": 8.63469387755102e-06, "loss": 0.4766, "step": 670 }, { "epoch": 2.7755102040816326, "grad_norm": 4.662519931793213, "learning_rate": 8.614285714285716e-06, "loss": 0.3973, "step": 680 }, { "epoch": 2.816326530612245, "grad_norm": 6.288435935974121, "learning_rate": 8.593877551020409e-06, "loss": 0.4408, "step": 690 }, { "epoch": 2.857142857142857, "grad_norm": 6.625838279724121, "learning_rate": 8.573469387755102e-06, "loss": 0.2908, "step": 700 }, { "epoch": 2.8979591836734695, "grad_norm": 8.510032653808594, "learning_rate": 8.553061224489797e-06, "loss": 0.3813, "step": 710 }, { "epoch": 2.938775510204082, "grad_norm": 11.82463264465332, "learning_rate": 8.53265306122449e-06, "loss": 0.4352, "step": 720 }, { "epoch": 2.979591836734694, "grad_norm": 8.821819305419922, "learning_rate": 8.512244897959184e-06, "loss": 0.4318, "step": 730 }, { "epoch": 3.020408163265306, "grad_norm": 8.010713577270508, "learning_rate": 8.491836734693879e-06, "loss": 0.2323, "step": 740 }, { "epoch": 3.061224489795918, "grad_norm": 9.03991985321045, "learning_rate": 8.471428571428572e-06, "loss": 0.3603, "step": 750 }, { "epoch": 3.1020408163265305, "grad_norm": 10.94204044342041, "learning_rate": 8.451020408163267e-06, "loss": 0.3576, "step": 760 }, { "epoch": 3.142857142857143, "grad_norm": 7.89410924911499, "learning_rate": 8.43061224489796e-06, "loss": 0.2851, "step": 770 }, { "epoch": 3.183673469387755, "grad_norm": 6.53656005859375, "learning_rate": 8.410204081632653e-06, "loss": 0.318, "step": 780 }, { "epoch": 3.2244897959183674, "grad_norm": 6.487284183502197, "learning_rate": 8.389795918367347e-06, "loss": 0.317, "step": 790 }, { "epoch": 3.2653061224489797, "grad_norm": 6.947931289672852, "learning_rate": 8.369387755102042e-06, "loss": 0.2879, "step": 800 }, { "epoch": 3.306122448979592, "grad_norm": 4.166048526763916, "learning_rate": 8.348979591836735e-06, "loss": 0.3392, "step": 810 }, { "epoch": 3.3469387755102042, "grad_norm": 9.974846839904785, "learning_rate": 8.32857142857143e-06, "loss": 0.3663, "step": 820 }, { "epoch": 3.387755102040816, "grad_norm": 9.668428421020508, "learning_rate": 8.308163265306123e-06, "loss": 0.3212, "step": 830 }, { "epoch": 3.4285714285714284, "grad_norm": 11.81507396697998, "learning_rate": 8.287755102040816e-06, "loss": 0.3241, "step": 840 }, { "epoch": 3.4693877551020407, "grad_norm": 13.690321922302246, "learning_rate": 8.267346938775511e-06, "loss": 0.4535, "step": 850 }, { "epoch": 3.510204081632653, "grad_norm": 11.042778968811035, "learning_rate": 8.246938775510205e-06, "loss": 0.3826, "step": 860 }, { "epoch": 3.5510204081632653, "grad_norm": 8.57719612121582, "learning_rate": 8.226530612244898e-06, "loss": 0.3905, "step": 870 }, { "epoch": 3.5918367346938775, "grad_norm": 7.843425750732422, "learning_rate": 8.206122448979591e-06, "loss": 0.3125, "step": 880 }, { "epoch": 3.63265306122449, "grad_norm": 5.9236931800842285, "learning_rate": 8.185714285714286e-06, "loss": 0.3512, "step": 890 }, { "epoch": 3.673469387755102, "grad_norm": 8.213603973388672, "learning_rate": 8.165306122448981e-06, "loss": 0.4094, "step": 900 }, { "epoch": 3.7142857142857144, "grad_norm": 3.8083949089050293, "learning_rate": 8.144897959183674e-06, "loss": 0.2751, "step": 910 }, { "epoch": 3.7551020408163263, "grad_norm": 12.339240074157715, "learning_rate": 8.124489795918368e-06, "loss": 0.3296, "step": 920 }, { "epoch": 3.795918367346939, "grad_norm": 9.532052040100098, "learning_rate": 8.104081632653061e-06, "loss": 0.3033, "step": 930 }, { "epoch": 3.836734693877551, "grad_norm": 6.307032108306885, "learning_rate": 8.083673469387756e-06, "loss": 0.3765, "step": 940 }, { "epoch": 3.877551020408163, "grad_norm": 7.3003010749816895, "learning_rate": 8.06326530612245e-06, "loss": 0.2161, "step": 950 }, { "epoch": 3.9183673469387754, "grad_norm": 7.6572699546813965, "learning_rate": 8.042857142857143e-06, "loss": 0.2886, "step": 960 }, { "epoch": 3.9591836734693877, "grad_norm": 6.745776176452637, "learning_rate": 8.022448979591838e-06, "loss": 0.3376, "step": 970 }, { "epoch": 4.0, "grad_norm": 10.482270240783691, "learning_rate": 8.002040816326533e-06, "loss": 0.2657, "step": 980 }, { "epoch": 4.040816326530612, "grad_norm": 6.213717460632324, "learning_rate": 7.981632653061226e-06, "loss": 0.2596, "step": 990 }, { "epoch": 4.081632653061225, "grad_norm": 10.256094932556152, "learning_rate": 7.961224489795919e-06, "loss": 0.2338, "step": 1000 } ], "logging_steps": 10, "max_steps": 4900, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 30, "trial_name": null, "trial_params": null }