{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5e-05, "grad_norm": 4.426811695098877, "learning_rate": 4.999755000000001e-05, "loss": 6.3755, "step": 50 }, { "epoch": 0.0001, "grad_norm": 1.7730491161346436, "learning_rate": 4.999505e-05, "loss": 3.3233, "step": 100 }, { "epoch": 0.00015, "grad_norm": 1.4473446607589722, "learning_rate": 4.999255e-05, "loss": 2.7523, "step": 150 }, { "epoch": 0.0002, "grad_norm": 1.4645352363586426, "learning_rate": 4.9990050000000004e-05, "loss": 2.4618, "step": 200 }, { "epoch": 0.00025, "grad_norm": 1.1990822553634644, "learning_rate": 4.998755e-05, "loss": 2.3417, "step": 250 }, { "epoch": 0.0003, "grad_norm": 3.0459673404693604, "learning_rate": 4.998505e-05, "loss": 2.2581, "step": 300 }, { "epoch": 0.00035, "grad_norm": 1.2380167245864868, "learning_rate": 4.998255e-05, "loss": 2.1563, "step": 350 }, { "epoch": 0.0004, "grad_norm": 1.2060939073562622, "learning_rate": 4.998005e-05, "loss": 1.8521, "step": 400 }, { "epoch": 0.00045, "grad_norm": 0.9207198619842529, "learning_rate": 4.9977550000000004e-05, "loss": 1.9201, "step": 450 }, { "epoch": 0.0005, "grad_norm": 1.0762739181518555, "learning_rate": 4.997505e-05, "loss": 1.8104, "step": 500 }, { "epoch": 0.00055, "grad_norm": 1.0367337465286255, "learning_rate": 4.997255e-05, "loss": 1.8756, "step": 550 }, { "epoch": 0.0006, "grad_norm": 0.9291325211524963, "learning_rate": 4.997005e-05, "loss": 1.7404, "step": 600 }, { "epoch": 0.00065, "grad_norm": 0.7452297210693359, "learning_rate": 4.996755e-05, "loss": 1.6843, "step": 650 }, { "epoch": 0.0007, "grad_norm": 0.9317042231559753, "learning_rate": 4.9965050000000004e-05, "loss": 1.7006, "step": 700 }, { "epoch": 0.00075, "grad_norm": 1.0354136228561401, "learning_rate": 4.9962550000000005e-05, "loss": 1.5652, "step": 750 }, { "epoch": 0.0008, "grad_norm": 1.050106406211853, "learning_rate": 4.996005e-05, "loss": 1.5966, "step": 800 }, { "epoch": 0.00085, "grad_norm": 0.9646019339561462, "learning_rate": 4.995755e-05, "loss": 1.6297, "step": 850 }, { "epoch": 0.0009, "grad_norm": 0.9152287840843201, "learning_rate": 4.995505e-05, "loss": 1.6328, "step": 900 }, { "epoch": 0.00095, "grad_norm": 0.9403690099716187, "learning_rate": 4.995255e-05, "loss": 1.6056, "step": 950 }, { "epoch": 0.001, "grad_norm": 1.1822874546051025, "learning_rate": 4.9950050000000005e-05, "loss": 1.6016, "step": 1000 }, { "epoch": 0.00105, "grad_norm": 1.3227542638778687, "learning_rate": 4.9947550000000006e-05, "loss": 1.5448, "step": 1050 }, { "epoch": 0.0011, "grad_norm": 1.0503350496292114, "learning_rate": 4.994505000000001e-05, "loss": 1.6355, "step": 1100 }, { "epoch": 0.00115, "grad_norm": 1.1647204160690308, "learning_rate": 4.994255e-05, "loss": 1.4673, "step": 1150 }, { "epoch": 0.0012, "grad_norm": 0.7281339168548584, "learning_rate": 4.994005e-05, "loss": 1.4641, "step": 1200 }, { "epoch": 0.00125, "grad_norm": 1.2438446283340454, "learning_rate": 4.9937550000000004e-05, "loss": 1.5177, "step": 1250 }, { "epoch": 0.0013, "grad_norm": 0.7967873811721802, "learning_rate": 4.993505e-05, "loss": 1.4399, "step": 1300 }, { "epoch": 0.00135, "grad_norm": 0.8938255310058594, "learning_rate": 4.993255e-05, "loss": 1.4453, "step": 1350 }, { "epoch": 0.0014, "grad_norm": 1.4659217596054077, "learning_rate": 4.993005e-05, "loss": 1.6517, "step": 1400 }, { "epoch": 0.00145, "grad_norm": 0.7856793403625488, "learning_rate": 4.992755e-05, "loss": 1.4721, "step": 1450 }, { "epoch": 0.0015, "grad_norm": 0.6772142648696899, "learning_rate": 4.9925050000000004e-05, "loss": 1.5314, "step": 1500 }, { "epoch": 0.00155, "grad_norm": 0.7613831758499146, "learning_rate": 4.992255e-05, "loss": 1.4603, "step": 1550 }, { "epoch": 0.0016, "grad_norm": 1.1216529607772827, "learning_rate": 4.992005e-05, "loss": 1.54, "step": 1600 }, { "epoch": 0.00165, "grad_norm": 0.7359323501586914, "learning_rate": 4.991755e-05, "loss": 1.5121, "step": 1650 }, { "epoch": 0.0017, "grad_norm": 0.8343626260757446, "learning_rate": 4.991505e-05, "loss": 1.6107, "step": 1700 }, { "epoch": 0.00175, "grad_norm": 0.9381140470504761, "learning_rate": 4.9912550000000004e-05, "loss": 1.5059, "step": 1750 }, { "epoch": 0.0018, "grad_norm": 0.7317638993263245, "learning_rate": 4.9910050000000005e-05, "loss": 1.5621, "step": 1800 }, { "epoch": 0.00185, "grad_norm": 0.8273525834083557, "learning_rate": 4.990755e-05, "loss": 1.4737, "step": 1850 }, { "epoch": 0.0019, "grad_norm": 0.7206686735153198, "learning_rate": 4.990505e-05, "loss": 1.5242, "step": 1900 }, { "epoch": 0.00195, "grad_norm": 0.9094993472099304, "learning_rate": 4.990255e-05, "loss": 1.5336, "step": 1950 }, { "epoch": 0.002, "grad_norm": 0.7432169914245605, "learning_rate": 4.9900050000000003e-05, "loss": 1.4913, "step": 2000 }, { "epoch": 0.00205, "grad_norm": 1.337276816368103, "learning_rate": 4.9897550000000005e-05, "loss": 1.5643, "step": 2050 }, { "epoch": 0.0021, "grad_norm": 0.6664676666259766, "learning_rate": 4.9895050000000006e-05, "loss": 1.4728, "step": 2100 }, { "epoch": 0.00215, "grad_norm": 0.7946372628211975, "learning_rate": 4.989255000000001e-05, "loss": 1.4832, "step": 2150 }, { "epoch": 0.0022, "grad_norm": 0.9259161949157715, "learning_rate": 4.989005e-05, "loss": 1.5674, "step": 2200 }, { "epoch": 0.00225, "grad_norm": 0.9703247547149658, "learning_rate": 4.988755e-05, "loss": 1.3973, "step": 2250 }, { "epoch": 0.0023, "grad_norm": 0.7390087842941284, "learning_rate": 4.988505e-05, "loss": 1.3919, "step": 2300 }, { "epoch": 0.00235, "grad_norm": 0.768277645111084, "learning_rate": 4.988255e-05, "loss": 1.4815, "step": 2350 }, { "epoch": 0.0024, "grad_norm": 0.76382976770401, "learning_rate": 4.988005e-05, "loss": 1.4458, "step": 2400 }, { "epoch": 0.00245, "grad_norm": 0.7780851721763611, "learning_rate": 4.987755e-05, "loss": 1.4382, "step": 2450 }, { "epoch": 0.0025, "grad_norm": 0.7184464335441589, "learning_rate": 4.987505e-05, "loss": 1.521, "step": 2500 }, { "epoch": 0.00255, "grad_norm": 0.6212390065193176, "learning_rate": 4.9872550000000004e-05, "loss": 1.3889, "step": 2550 }, { "epoch": 0.0026, "grad_norm": 0.8539580702781677, "learning_rate": 4.987005e-05, "loss": 1.415, "step": 2600 }, { "epoch": 0.00265, "grad_norm": 0.7129775881767273, "learning_rate": 4.986755e-05, "loss": 1.4102, "step": 2650 }, { "epoch": 0.0027, "grad_norm": 0.5899195671081543, "learning_rate": 4.986505e-05, "loss": 1.3074, "step": 2700 }, { "epoch": 0.00275, "grad_norm": 0.6940101981163025, "learning_rate": 4.986255e-05, "loss": 1.3748, "step": 2750 }, { "epoch": 0.0028, "grad_norm": 0.6420891880989075, "learning_rate": 4.9860050000000004e-05, "loss": 1.4089, "step": 2800 }, { "epoch": 0.00285, "grad_norm": 0.8561428189277649, "learning_rate": 4.9857550000000005e-05, "loss": 1.4052, "step": 2850 }, { "epoch": 0.0029, "grad_norm": 0.6900970935821533, "learning_rate": 4.9855050000000006e-05, "loss": 1.4323, "step": 2900 }, { "epoch": 0.00295, "grad_norm": 0.8071371912956238, "learning_rate": 4.985255e-05, "loss": 1.3485, "step": 2950 }, { "epoch": 0.003, "grad_norm": 0.6493249535560608, "learning_rate": 4.985005e-05, "loss": 1.4321, "step": 3000 }, { "epoch": 0.00305, "grad_norm": 0.8712514042854309, "learning_rate": 4.9847550000000004e-05, "loss": 1.406, "step": 3050 }, { "epoch": 0.0031, "grad_norm": 0.8464570045471191, "learning_rate": 4.9845050000000005e-05, "loss": 1.3322, "step": 3100 }, { "epoch": 0.00315, "grad_norm": 0.7779001593589783, "learning_rate": 4.9842550000000006e-05, "loss": 1.3905, "step": 3150 }, { "epoch": 0.0032, "grad_norm": 0.8314748406410217, "learning_rate": 4.984005000000001e-05, "loss": 1.4134, "step": 3200 }, { "epoch": 0.00325, "grad_norm": 0.7180835604667664, "learning_rate": 4.983755e-05, "loss": 1.3629, "step": 3250 }, { "epoch": 0.0033, "grad_norm": 0.6165933012962341, "learning_rate": 4.9835049999999996e-05, "loss": 1.3833, "step": 3300 }, { "epoch": 0.00335, "grad_norm": 1.8662049770355225, "learning_rate": 4.983255e-05, "loss": 1.3557, "step": 3350 }, { "epoch": 0.0034, "grad_norm": 0.9091550707817078, "learning_rate": 4.983005e-05, "loss": 1.4092, "step": 3400 }, { "epoch": 0.00345, "grad_norm": 0.9908722639083862, "learning_rate": 4.982755e-05, "loss": 1.3465, "step": 3450 }, { "epoch": 0.0035, "grad_norm": 0.862427830696106, "learning_rate": 4.982505e-05, "loss": 1.3143, "step": 3500 }, { "epoch": 0.00355, "grad_norm": 0.755211591720581, "learning_rate": 4.982255e-05, "loss": 1.358, "step": 3550 }, { "epoch": 0.0036, "grad_norm": 0.7872670888900757, "learning_rate": 4.9820050000000004e-05, "loss": 1.3952, "step": 3600 }, { "epoch": 0.00365, "grad_norm": 1.5124619007110596, "learning_rate": 4.981755e-05, "loss": 1.2929, "step": 3650 }, { "epoch": 0.0037, "grad_norm": 0.7712079286575317, "learning_rate": 4.981505e-05, "loss": 1.3429, "step": 3700 }, { "epoch": 0.00375, "grad_norm": 0.7001494765281677, "learning_rate": 4.981255e-05, "loss": 1.272, "step": 3750 }, { "epoch": 0.0038, "grad_norm": 0.674104630947113, "learning_rate": 4.981005e-05, "loss": 1.3891, "step": 3800 }, { "epoch": 0.00385, "grad_norm": 1.0494478940963745, "learning_rate": 4.9807550000000004e-05, "loss": 1.333, "step": 3850 }, { "epoch": 0.0039, "grad_norm": 0.6674365401268005, "learning_rate": 4.9805050000000005e-05, "loss": 1.4418, "step": 3900 }, { "epoch": 0.00395, "grad_norm": 0.7624682784080505, "learning_rate": 4.9802550000000007e-05, "loss": 1.2837, "step": 3950 }, { "epoch": 0.004, "grad_norm": 1.5128511190414429, "learning_rate": 4.980005e-05, "loss": 1.2609, "step": 4000 }, { "epoch": 0.00405, "grad_norm": 0.5422250628471375, "learning_rate": 4.979755e-05, "loss": 1.3253, "step": 4050 }, { "epoch": 0.0041, "grad_norm": 0.63419508934021, "learning_rate": 4.9795050000000004e-05, "loss": 1.3369, "step": 4100 }, { "epoch": 0.00415, "grad_norm": 0.7608025670051575, "learning_rate": 4.9792550000000005e-05, "loss": 1.3186, "step": 4150 }, { "epoch": 0.0042, "grad_norm": 0.6282067894935608, "learning_rate": 4.9790050000000006e-05, "loss": 1.3755, "step": 4200 }, { "epoch": 0.00425, "grad_norm": 0.7882742881774902, "learning_rate": 4.978755000000001e-05, "loss": 1.3326, "step": 4250 }, { "epoch": 0.0043, "grad_norm": 0.6797521710395813, "learning_rate": 4.978505e-05, "loss": 1.2774, "step": 4300 }, { "epoch": 0.00435, "grad_norm": 0.6129476428031921, "learning_rate": 4.978255e-05, "loss": 1.324, "step": 4350 }, { "epoch": 0.0044, "grad_norm": 0.7544896006584167, "learning_rate": 4.978005e-05, "loss": 1.299, "step": 4400 }, { "epoch": 0.00445, "grad_norm": 0.6218116283416748, "learning_rate": 4.977755e-05, "loss": 1.3084, "step": 4450 }, { "epoch": 0.0045, "grad_norm": 0.5591554045677185, "learning_rate": 4.977505e-05, "loss": 1.3001, "step": 4500 }, { "epoch": 0.00455, "grad_norm": 0.854743242263794, "learning_rate": 4.977255e-05, "loss": 1.3357, "step": 4550 }, { "epoch": 0.0046, "grad_norm": 0.7265657186508179, "learning_rate": 4.977005e-05, "loss": 1.3362, "step": 4600 }, { "epoch": 0.00465, "grad_norm": 0.8089432120323181, "learning_rate": 4.9767550000000004e-05, "loss": 1.3087, "step": 4650 }, { "epoch": 0.0047, "grad_norm": 0.8999931216239929, "learning_rate": 4.976505e-05, "loss": 1.3894, "step": 4700 }, { "epoch": 0.00475, "grad_norm": 0.7705693244934082, "learning_rate": 4.976255e-05, "loss": 1.2585, "step": 4750 }, { "epoch": 0.0048, "grad_norm": 0.7201912999153137, "learning_rate": 4.976005e-05, "loss": 1.221, "step": 4800 }, { "epoch": 0.00485, "grad_norm": 0.6199264526367188, "learning_rate": 4.975755e-05, "loss": 1.2878, "step": 4850 }, { "epoch": 0.0049, "grad_norm": 0.6883070468902588, "learning_rate": 4.9755050000000004e-05, "loss": 1.3032, "step": 4900 }, { "epoch": 0.00495, "grad_norm": 0.524889349937439, "learning_rate": 4.9752550000000005e-05, "loss": 1.2706, "step": 4950 }, { "epoch": 0.005, "grad_norm": 0.7067034244537354, "learning_rate": 4.9750050000000007e-05, "loss": 1.2424, "step": 5000 }, { "epoch": 0.00505, "grad_norm": 0.7097471356391907, "learning_rate": 4.974755e-05, "loss": 1.3175, "step": 5050 }, { "epoch": 0.0051, "grad_norm": 0.6991677284240723, "learning_rate": 4.974505e-05, "loss": 1.2983, "step": 5100 }, { "epoch": 0.00515, "grad_norm": 1.4025177955627441, "learning_rate": 4.9742550000000004e-05, "loss": 1.2695, "step": 5150 }, { "epoch": 0.0052, "grad_norm": 0.6075018048286438, "learning_rate": 4.9740050000000005e-05, "loss": 1.2973, "step": 5200 }, { "epoch": 0.00525, "grad_norm": 0.5344257950782776, "learning_rate": 4.9737550000000006e-05, "loss": 1.2353, "step": 5250 }, { "epoch": 0.0053, "grad_norm": 0.5963959693908691, "learning_rate": 4.973505e-05, "loss": 1.2359, "step": 5300 }, { "epoch": 0.00535, "grad_norm": 0.9493517279624939, "learning_rate": 4.973255e-05, "loss": 1.252, "step": 5350 }, { "epoch": 0.0054, "grad_norm": 0.6466087102890015, "learning_rate": 4.9730050000000003e-05, "loss": 1.2012, "step": 5400 }, { "epoch": 0.00545, "grad_norm": 0.6407445669174194, "learning_rate": 4.972755e-05, "loss": 1.2615, "step": 5450 }, { "epoch": 0.0055, "grad_norm": 0.617421567440033, "learning_rate": 4.972505e-05, "loss": 1.2718, "step": 5500 }, { "epoch": 0.00555, "grad_norm": 0.5649735331535339, "learning_rate": 4.972255e-05, "loss": 1.2439, "step": 5550 }, { "epoch": 0.0056, "grad_norm": 0.675645649433136, "learning_rate": 4.972005e-05, "loss": 1.2847, "step": 5600 }, { "epoch": 0.00565, "grad_norm": 0.6887643337249756, "learning_rate": 4.971755e-05, "loss": 1.2464, "step": 5650 }, { "epoch": 0.0057, "grad_norm": 0.6572535037994385, "learning_rate": 4.9715050000000004e-05, "loss": 1.2517, "step": 5700 }, { "epoch": 0.00575, "grad_norm": 0.6076003313064575, "learning_rate": 4.971255e-05, "loss": 1.2183, "step": 5750 }, { "epoch": 0.0058, "grad_norm": 0.6134760975837708, "learning_rate": 4.971005e-05, "loss": 1.2303, "step": 5800 }, { "epoch": 0.00585, "grad_norm": 0.6791936159133911, "learning_rate": 4.970755e-05, "loss": 1.2351, "step": 5850 }, { "epoch": 0.0059, "grad_norm": 0.6307588219642639, "learning_rate": 4.970505e-05, "loss": 1.254, "step": 5900 }, { "epoch": 0.00595, "grad_norm": 1.1272459030151367, "learning_rate": 4.9702550000000004e-05, "loss": 1.265, "step": 5950 }, { "epoch": 0.006, "grad_norm": 0.5415930151939392, "learning_rate": 4.9700050000000005e-05, "loss": 1.2068, "step": 6000 }, { "epoch": 0.00605, "grad_norm": 0.5450507402420044, "learning_rate": 4.969755000000001e-05, "loss": 1.2408, "step": 6050 }, { "epoch": 0.0061, "grad_norm": 0.6528737545013428, "learning_rate": 4.969505e-05, "loss": 1.2035, "step": 6100 }, { "epoch": 0.00615, "grad_norm": 0.6139540672302246, "learning_rate": 4.969255e-05, "loss": 1.2077, "step": 6150 }, { "epoch": 0.0062, "grad_norm": 0.6503865718841553, "learning_rate": 4.9690050000000004e-05, "loss": 1.2055, "step": 6200 }, { "epoch": 0.00625, "grad_norm": 0.6888736486434937, "learning_rate": 4.9687550000000005e-05, "loss": 1.2161, "step": 6250 }, { "epoch": 0.0063, "grad_norm": 1.1457703113555908, "learning_rate": 4.968505e-05, "loss": 1.1939, "step": 6300 }, { "epoch": 0.00635, "grad_norm": 0.5861665606498718, "learning_rate": 4.968255e-05, "loss": 1.2177, "step": 6350 }, { "epoch": 0.0064, "grad_norm": 0.8666422963142395, "learning_rate": 4.968005e-05, "loss": 1.1945, "step": 6400 }, { "epoch": 0.00645, "grad_norm": 0.6094856858253479, "learning_rate": 4.9677550000000003e-05, "loss": 1.2394, "step": 6450 }, { "epoch": 0.0065, "grad_norm": 0.6510404348373413, "learning_rate": 4.967505e-05, "loss": 1.1764, "step": 6500 }, { "epoch": 0.00655, "grad_norm": 0.7044185996055603, "learning_rate": 4.967255e-05, "loss": 1.1951, "step": 6550 }, { "epoch": 0.0066, "grad_norm": 0.6577385663986206, "learning_rate": 4.967005e-05, "loss": 1.2186, "step": 6600 }, { "epoch": 0.00665, "grad_norm": 0.6468052268028259, "learning_rate": 4.966755e-05, "loss": 1.2491, "step": 6650 }, { "epoch": 0.0067, "grad_norm": 0.6153103709220886, "learning_rate": 4.966505e-05, "loss": 1.25, "step": 6700 }, { "epoch": 0.00675, "grad_norm": 0.6066887974739075, "learning_rate": 4.9662550000000004e-05, "loss": 1.1852, "step": 6750 }, { "epoch": 0.0068, "grad_norm": 0.5735403895378113, "learning_rate": 4.9660050000000006e-05, "loss": 1.2139, "step": 6800 }, { "epoch": 0.00685, "grad_norm": 0.6247462630271912, "learning_rate": 4.965755e-05, "loss": 1.1597, "step": 6850 }, { "epoch": 0.0069, "grad_norm": 0.6690845489501953, "learning_rate": 4.965505e-05, "loss": 1.2095, "step": 6900 }, { "epoch": 0.00695, "grad_norm": 0.5683197975158691, "learning_rate": 4.965255e-05, "loss": 1.3077, "step": 6950 }, { "epoch": 0.007, "grad_norm": 0.6314153671264648, "learning_rate": 4.9650050000000004e-05, "loss": 1.2556, "step": 7000 }, { "epoch": 0.00705, "grad_norm": 0.641153872013092, "learning_rate": 4.9647550000000005e-05, "loss": 1.1643, "step": 7050 }, { "epoch": 0.0071, "grad_norm": 0.6113337278366089, "learning_rate": 4.964505000000001e-05, "loss": 1.1688, "step": 7100 }, { "epoch": 0.00715, "grad_norm": 0.5102031230926514, "learning_rate": 4.964255e-05, "loss": 1.1835, "step": 7150 }, { "epoch": 0.0072, "grad_norm": 0.6926486492156982, "learning_rate": 4.964005e-05, "loss": 1.1704, "step": 7200 }, { "epoch": 0.00725, "grad_norm": 0.594326376914978, "learning_rate": 4.9637550000000004e-05, "loss": 1.1434, "step": 7250 }, { "epoch": 0.0073, "grad_norm": 0.5609344840049744, "learning_rate": 4.963505e-05, "loss": 1.2389, "step": 7300 }, { "epoch": 0.00735, "grad_norm": 0.6464629173278809, "learning_rate": 4.963255e-05, "loss": 1.2097, "step": 7350 }, { "epoch": 0.0074, "grad_norm": 0.5782963633537292, "learning_rate": 4.963005e-05, "loss": 1.2607, "step": 7400 }, { "epoch": 0.00745, "grad_norm": 0.7301307320594788, "learning_rate": 4.962755e-05, "loss": 1.1759, "step": 7450 }, { "epoch": 0.0075, "grad_norm": 0.5845964550971985, "learning_rate": 4.9625050000000004e-05, "loss": 1.1806, "step": 7500 }, { "epoch": 0.00755, "grad_norm": 0.618295431137085, "learning_rate": 4.962255e-05, "loss": 1.1339, "step": 7550 }, { "epoch": 0.0076, "grad_norm": 0.5799853801727295, "learning_rate": 4.962005e-05, "loss": 1.1641, "step": 7600 }, { "epoch": 0.00765, "grad_norm": 0.5227323174476624, "learning_rate": 4.961755e-05, "loss": 1.2375, "step": 7650 }, { "epoch": 0.0077, "grad_norm": 0.699111819267273, "learning_rate": 4.961505e-05, "loss": 1.0943, "step": 7700 }, { "epoch": 0.00775, "grad_norm": 0.5230436325073242, "learning_rate": 4.961255e-05, "loss": 1.1762, "step": 7750 }, { "epoch": 0.0078, "grad_norm": 0.5776082873344421, "learning_rate": 4.9610050000000005e-05, "loss": 1.18, "step": 7800 }, { "epoch": 0.00785, "grad_norm": 0.584697425365448, "learning_rate": 4.9607550000000006e-05, "loss": 1.2878, "step": 7850 }, { "epoch": 0.0079, "grad_norm": 0.66282057762146, "learning_rate": 4.960505e-05, "loss": 1.219, "step": 7900 }, { "epoch": 0.00795, "grad_norm": 1.2405346632003784, "learning_rate": 4.960255e-05, "loss": 1.0984, "step": 7950 }, { "epoch": 0.008, "grad_norm": 0.6882185339927673, "learning_rate": 4.960005e-05, "loss": 1.2291, "step": 8000 }, { "epoch": 0.00805, "grad_norm": 0.5376139879226685, "learning_rate": 4.9597550000000004e-05, "loss": 1.1768, "step": 8050 }, { "epoch": 0.0081, "grad_norm": 0.5613966584205627, "learning_rate": 4.9595050000000006e-05, "loss": 1.1882, "step": 8100 }, { "epoch": 0.00815, "grad_norm": 0.5976732969284058, "learning_rate": 4.959255000000001e-05, "loss": 1.1938, "step": 8150 }, { "epoch": 0.0082, "grad_norm": 0.5053662657737732, "learning_rate": 4.959005e-05, "loss": 1.1335, "step": 8200 }, { "epoch": 0.00825, "grad_norm": 0.5319002866744995, "learning_rate": 4.958755e-05, "loss": 1.23, "step": 8250 }, { "epoch": 0.0083, "grad_norm": 0.6441113948822021, "learning_rate": 4.9585050000000004e-05, "loss": 1.095, "step": 8300 }, { "epoch": 0.00835, "grad_norm": 0.7779256701469421, "learning_rate": 4.958255e-05, "loss": 1.1637, "step": 8350 }, { "epoch": 0.0084, "grad_norm": 0.6262242794036865, "learning_rate": 4.958005e-05, "loss": 1.1756, "step": 8400 }, { "epoch": 0.00845, "grad_norm": 0.6683831214904785, "learning_rate": 4.957755e-05, "loss": 1.2142, "step": 8450 }, { "epoch": 0.0085, "grad_norm": 0.8390682339668274, "learning_rate": 4.957505e-05, "loss": 1.147, "step": 8500 }, { "epoch": 0.00855, "grad_norm": 0.6620815396308899, "learning_rate": 4.9572550000000004e-05, "loss": 1.1619, "step": 8550 }, { "epoch": 0.0086, "grad_norm": 0.5890603065490723, "learning_rate": 4.957005e-05, "loss": 1.1076, "step": 8600 }, { "epoch": 0.00865, "grad_norm": 0.578610897064209, "learning_rate": 4.956755e-05, "loss": 1.0664, "step": 8650 }, { "epoch": 0.0087, "grad_norm": 0.5939948558807373, "learning_rate": 4.956505e-05, "loss": 1.1613, "step": 8700 }, { "epoch": 0.00875, "grad_norm": 0.6854498386383057, "learning_rate": 4.956255e-05, "loss": 1.218, "step": 8750 }, { "epoch": 0.0088, "grad_norm": 0.8202462196350098, "learning_rate": 4.956005e-05, "loss": 1.2239, "step": 8800 }, { "epoch": 0.00885, "grad_norm": 0.5740517973899841, "learning_rate": 4.9557550000000005e-05, "loss": 1.1104, "step": 8850 }, { "epoch": 0.0089, "grad_norm": 0.7225694060325623, "learning_rate": 4.9555050000000006e-05, "loss": 1.1617, "step": 8900 }, { "epoch": 0.00895, "grad_norm": 0.5652306079864502, "learning_rate": 4.955255e-05, "loss": 1.0913, "step": 8950 }, { "epoch": 0.009, "grad_norm": 0.9501689672470093, "learning_rate": 4.955005e-05, "loss": 1.1862, "step": 9000 } ], "logging_steps": 50, "max_steps": 1000000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.67761573576704e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }