[ { "step": 1, "epoch": 0.0005407963225850064, "wallclock": "2026-06-24T03:50:12.865305", "loss": 0.4579, "grad_norm": 2.3048150539398193, "learning_rate": 9.00900900900901e-08 }, { "step": 5, "epoch": 0.002703981612925032, "wallclock": "2026-06-24T03:51:38.176434", "loss": 0.3614, "grad_norm": 1.8860491514205933, "learning_rate": 4.504504504504505e-07, "step_time_sec": 85.31 }, { "step": 10, "epoch": 0.005407963225850064, "wallclock": "2026-06-24T03:53:23.303779", "loss": 0.4862, "grad_norm": 2.008655548095703, "learning_rate": 9.00900900900901e-07, "step_time_sec": 105.13 }, { "step": 15, "epoch": 0.008111944838775096, "wallclock": "2026-06-24T03:55:09.377081", "loss": 0.4649, "grad_norm": 2.2208123207092285, "learning_rate": 1.3513513513513515e-06, "step_time_sec": 106.07 }, { "step": 20, "epoch": 0.010815926451700129, "wallclock": "2026-06-24T03:56:56.093355", "loss": 0.4136, "grad_norm": 1.8327311277389526, "learning_rate": 1.801801801801802e-06, "step_time_sec": 106.72 }, { "step": 25, "epoch": 0.01351990806462516, "wallclock": "2026-06-24T03:58:40.595516", "loss": 0.4941, "grad_norm": 2.889826536178589, "learning_rate": 2.2522522522522524e-06, "step_time_sec": 104.5 }, { "step": 30, "epoch": 0.01622388967755019, "wallclock": "2026-06-24T04:00:26.401533", "loss": 0.3599, "grad_norm": 2.0262131690979004, "learning_rate": 2.702702702702703e-06, "step_time_sec": 105.81 }, { "step": 35, "epoch": 0.018927871290475226, "wallclock": "2026-06-24T04:02:12.105552", "loss": 0.2975, "grad_norm": 1.627108097076416, "learning_rate": 3.1531531531531532e-06, "step_time_sec": 105.7 }, { "step": 40, "epoch": 0.021631852903400257, "wallclock": "2026-06-24T04:03:56.186851", "loss": 0.3563, "grad_norm": 1.6855164766311646, "learning_rate": 3.603603603603604e-06, "step_time_sec": 104.08 }, { "step": 45, "epoch": 0.02433583451632529, "wallclock": "2026-06-24T04:05:41.990511", "loss": 0.3596, "grad_norm": 1.661110520362854, "learning_rate": 4.0540540540540545e-06, "step_time_sec": 105.8 }, { "step": 50, "epoch": 0.02703981612925032, "wallclock": "2026-06-24T04:07:27.997794", "loss": 0.267, "grad_norm": 1.2917487621307373, "learning_rate": 4.504504504504505e-06, "step_time_sec": 106.01, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 75.27 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 55, "epoch": 0.029743797742175355, "wallclock": "2026-06-24T04:09:12.604118", "loss": 0.2226, "grad_norm": 0.9745638370513916, "learning_rate": 4.954954954954955e-06, "step_time_sec": 104.61 }, { "step": 60, "epoch": 0.03244777935510038, "wallclock": "2026-06-24T04:10:57.892162", "loss": 0.1948, "grad_norm": 1.2490293979644775, "learning_rate": 5.405405405405406e-06, "step_time_sec": 105.29 }, { "step": 65, "epoch": 0.03515176096802542, "wallclock": "2026-06-24T04:12:44.688074", "loss": 0.2015, "grad_norm": 0.9993012547492981, "learning_rate": 5.855855855855856e-06, "step_time_sec": 106.8 }, { "step": 70, "epoch": 0.03785574258095045, "wallclock": "2026-06-24T04:14:30.291858", "loss": 0.2271, "grad_norm": 1.3109948635101318, "learning_rate": 6.3063063063063065e-06, "step_time_sec": 105.6 }, { "step": 75, "epoch": 0.040559724193875484, "wallclock": "2026-06-24T04:16:14.219008", "loss": 0.157, "grad_norm": 0.6500820517539978, "learning_rate": 6.7567567567567575e-06, "step_time_sec": 103.93 }, { "step": 80, "epoch": 0.043263705806800515, "wallclock": "2026-06-24T04:17:59.413498", "loss": 0.1579, "grad_norm": 0.8443478345870972, "learning_rate": 7.207207207207208e-06, "step_time_sec": 105.19 }, { "step": 85, "epoch": 0.045967687419725546, "wallclock": "2026-06-24T04:19:44.517567", "loss": 0.1641, "grad_norm": 0.9615593552589417, "learning_rate": 7.657657657657658e-06, "step_time_sec": 105.1 }, { "step": 90, "epoch": 0.04867166903265058, "wallclock": "2026-06-24T04:21:28.592924", "loss": 0.1288, "grad_norm": 0.6482295989990234, "learning_rate": 8.108108108108109e-06, "step_time_sec": 104.08 }, { "step": 95, "epoch": 0.05137565064557561, "wallclock": "2026-06-24T04:23:14.290954", "loss": 0.136, "grad_norm": 0.8641292452812195, "learning_rate": 8.55855855855856e-06, "step_time_sec": 105.7 }, { "step": 100, "epoch": 0.05407963225850064, "wallclock": "2026-06-24T04:24:58.500503", "loss": 0.1148, "grad_norm": 0.7579247355461121, "learning_rate": 9.00900900900901e-06, "step_time_sec": 104.21, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 77.06 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 100, "epoch": 0.05407963225850064, "wallclock": "2026-06-24T04:26:38.179343", "eval_loss": 0.09886857122182846, "eval_runtime": 99.676, "eval_samples_per_second": 5.016, "eval_steps_per_second": 1.254, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 77.07 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 105, "epoch": 0.05678361387142567, "wallclock": "2026-06-24T04:29:47.276194", "loss": 0.1357, "grad_norm": 1.0714149475097656, "learning_rate": 9.45945945945946e-06, "step_time_sec": 288.78 }, { "step": 110, "epoch": 0.05948759548435071, "wallclock": "2026-06-24T04:31:33.882914", "loss": 0.1446, "grad_norm": 0.9079675078392029, "learning_rate": 9.90990990990991e-06, "step_time_sec": 106.61 }, { "step": 115, "epoch": 0.06219157709727574, "wallclock": "2026-06-24T04:33:20.601103", "loss": 0.1169, "grad_norm": 0.7082911729812622, "learning_rate": 9.999969317090495e-06, "step_time_sec": 106.72 }, { "step": 120, "epoch": 0.06489555871020077, "wallclock": "2026-06-24T04:35:06.071476", "loss": 0.1294, "grad_norm": 0.8290165066719055, "learning_rate": 9.99984466841603e-06, "step_time_sec": 105.47 }, { "step": 125, "epoch": 0.0675995403231258, "wallclock": "2026-06-24T04:36:51.012116", "loss": 0.0967, "grad_norm": 3.919275999069214, "learning_rate": 9.999624138683289e-06, "step_time_sec": 104.94 }, { "step": 130, "epoch": 0.07030352193605084, "wallclock": "2026-06-24T04:38:36.077195", "loss": 0.1226, "grad_norm": 0.9278262853622437, "learning_rate": 9.999307732121325e-06, "step_time_sec": 105.07 }, { "step": 135, "epoch": 0.07300750354897587, "wallclock": "2026-06-24T04:40:20.812264", "loss": 0.1098, "grad_norm": 0.7507790923118591, "learning_rate": 9.998895454797807e-06, "step_time_sec": 104.74 }, { "step": 140, "epoch": 0.0757114851619009, "wallclock": "2026-06-24T04:42:06.675192", "loss": 0.1013, "grad_norm": 0.42517712712287903, "learning_rate": 9.998387314618898e-06, "step_time_sec": 105.86 }, { "step": 145, "epoch": 0.07841546677482593, "wallclock": "2026-06-24T04:43:52.811351", "loss": 0.1198, "grad_norm": 0.9795618653297424, "learning_rate": 9.997783321329104e-06, "step_time_sec": 106.14 }, { "step": 150, "epoch": 0.08111944838775097, "wallclock": "2026-06-24T04:45:37.500620", "loss": 0.1097, "grad_norm": 0.8131667375564575, "learning_rate": 9.997083486511088e-06, "step_time_sec": 104.69, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 155, "epoch": 0.08382343000067599, "wallclock": "2026-06-24T04:47:23.099655", "loss": 0.1064, "grad_norm": 1.0449674129486084, "learning_rate": 9.996287823585446e-06, "step_time_sec": 105.6 }, { "step": 160, "epoch": 0.08652741161360103, "wallclock": "2026-06-24T04:49:08.308222", "loss": 0.1413, "grad_norm": 1.2222431898117065, "learning_rate": 9.995396347810456e-06, "step_time_sec": 105.21 }, { "step": 165, "epoch": 0.08923139322652605, "wallclock": "2026-06-24T04:50:53.614369", "loss": 0.095, "grad_norm": 0.6692535877227783, "learning_rate": 9.994409076281776e-06, "step_time_sec": 105.31 }, { "step": 170, "epoch": 0.09193537483945109, "wallclock": "2026-06-24T04:52:39.312428", "loss": 0.0987, "grad_norm": 0.7257323861122131, "learning_rate": 9.99332602793212e-06, "step_time_sec": 105.7 }, { "step": 175, "epoch": 0.09463935645237613, "wallclock": "2026-06-24T04:54:25.104575", "loss": 0.1172, "grad_norm": 0.694538414478302, "learning_rate": 9.992147223530901e-06, "step_time_sec": 105.79 }, { "step": 180, "epoch": 0.09734333806530115, "wallclock": "2026-06-24T04:56:09.832372", "loss": 0.0912, "grad_norm": 0.5451284646987915, "learning_rate": 9.99087268568382e-06, "step_time_sec": 104.73 }, { "step": 185, "epoch": 0.1000473196782262, "wallclock": "2026-06-24T04:57:55.711549", "loss": 0.1088, "grad_norm": 0.7407487034797668, "learning_rate": 9.989502438832447e-06, "step_time_sec": 105.88 }, { "step": 190, "epoch": 0.10275130129115122, "wallclock": "2026-06-24T04:59:41.200392", "loss": 0.1331, "grad_norm": 0.6032689809799194, "learning_rate": 9.988036509253742e-06, "step_time_sec": 105.49 }, { "step": 195, "epoch": 0.10545528290407626, "wallclock": "2026-06-24T05:01:24.975629", "loss": 0.091, "grad_norm": 0.7505941390991211, "learning_rate": 9.986474925059551e-06, "step_time_sec": 103.78 }, { "step": 200, "epoch": 0.10815926451700128, "wallclock": "2026-06-24T05:03:10.604700", "loss": 0.1116, "grad_norm": 0.6309108138084412, "learning_rate": 9.984817716196075e-06, "step_time_sec": 105.63, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 200, "epoch": 0.10815926451700128, "wallclock": "2026-06-24T05:04:50.512683", "eval_loss": 0.08997273445129395, "eval_runtime": 99.9051, "eval_samples_per_second": 5.005, "eval_steps_per_second": 1.251, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 205, "epoch": 0.11086324612992632, "wallclock": "2026-06-24T05:07:50.102603", "loss": 0.0916, "grad_norm": 0.4750153422355652, "learning_rate": 9.983064914443293e-06, "step_time_sec": 279.5 }, { "step": 210, "epoch": 0.11356722774285134, "wallclock": "2026-06-24T05:09:36.370868", "loss": 0.1137, "grad_norm": 0.6414338946342468, "learning_rate": 9.981216553414342e-06, "step_time_sec": 106.27 }, { "step": 215, "epoch": 0.11627120935577638, "wallclock": "2026-06-24T05:11:30.304900", "loss": 0.1261, "grad_norm": 0.7359138131141663, "learning_rate": 9.979272668554885e-06, "step_time_sec": 113.93 }, { "step": 220, "epoch": 0.11897519096870142, "wallclock": "2026-06-24T05:13:14.700481", "loss": 0.1016, "grad_norm": 0.8333423137664795, "learning_rate": 9.97723329714243e-06, "step_time_sec": 104.4 }, { "step": 225, "epoch": 0.12167917258162644, "wallclock": "2026-06-24T05:14:58.874076", "loss": 0.1084, "grad_norm": 0.7175215482711792, "learning_rate": 9.97509847828561e-06, "step_time_sec": 104.17 }, { "step": 230, "epoch": 0.12438315419455148, "wallclock": "2026-06-24T05:16:44.695629", "loss": 0.1165, "grad_norm": 0.5170373320579529, "learning_rate": 9.972868252923433e-06, "step_time_sec": 105.82 }, { "step": 235, "epoch": 0.12708713580747652, "wallclock": "2026-06-24T05:18:30.420350", "loss": 0.1014, "grad_norm": 1.0086610317230225, "learning_rate": 9.970542663824504e-06, "step_time_sec": 105.72 }, { "step": 240, "epoch": 0.12979111742040153, "wallclock": "2026-06-24T05:20:15.400259", "loss": 0.102, "grad_norm": 0.6341211199760437, "learning_rate": 9.968121755586196e-06, "step_time_sec": 104.98 }, { "step": 245, "epoch": 0.13249509903332657, "wallclock": "2026-06-24T05:22:01.027986", "loss": 0.1059, "grad_norm": 0.7365284562110901, "learning_rate": 9.965605574633798e-06, "step_time_sec": 105.63 }, { "step": 250, "epoch": 0.1351990806462516, "wallclock": "2026-06-24T05:23:47.012942", "loss": 0.0803, "grad_norm": 0.6329382061958313, "learning_rate": 9.96299416921963e-06, "step_time_sec": 105.98, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 255, "epoch": 0.13790306225917665, "wallclock": "2026-06-24T05:25:33.107736", "loss": 0.1085, "grad_norm": 0.6384260654449463, "learning_rate": 9.960287589422111e-06, "step_time_sec": 106.09 }, { "step": 260, "epoch": 0.14060704387210168, "wallclock": "2026-06-24T05:27:19.205894", "loss": 0.0953, "grad_norm": 0.7029681205749512, "learning_rate": 9.957485887144797e-06, "step_time_sec": 106.1 }, { "step": 265, "epoch": 0.1433110254850267, "wallclock": "2026-06-24T05:29:05.191995", "loss": 0.0855, "grad_norm": 0.7882628440856934, "learning_rate": 9.954589116115398e-06, "step_time_sec": 105.99 }, { "step": 270, "epoch": 0.14601500709795173, "wallclock": "2026-06-24T05:30:49.706053", "loss": 0.0996, "grad_norm": 0.8211791515350342, "learning_rate": 9.95159733188473e-06, "step_time_sec": 104.51 }, { "step": 275, "epoch": 0.14871898871087677, "wallclock": "2026-06-24T05:32:37.121576", "loss": 0.1183, "grad_norm": 0.8806095719337463, "learning_rate": 9.948510591825666e-06, "step_time_sec": 107.42 }, { "step": 280, "epoch": 0.1514229703238018, "wallclock": "2026-06-24T05:34:21.610766", "loss": 0.0954, "grad_norm": 0.7867270708084106, "learning_rate": 9.945328955132023e-06, "step_time_sec": 104.49 }, { "step": 285, "epoch": 0.15412695193672682, "wallclock": "2026-06-24T05:36:07.329379", "loss": 0.0886, "grad_norm": 0.7445922493934631, "learning_rate": 9.942052482817436e-06, "step_time_sec": 105.72 }, { "step": 290, "epoch": 0.15683093354965186, "wallclock": "2026-06-24T05:37:53.691605", "loss": 0.0762, "grad_norm": 0.4893661141395569, "learning_rate": 9.938681237714186e-06, "step_time_sec": 106.36 }, { "step": 295, "epoch": 0.1595349151625769, "wallclock": "2026-06-24T05:39:38.617866", "loss": 0.1037, "grad_norm": 0.7313506603240967, "learning_rate": 9.935215284471989e-06, "step_time_sec": 104.93 }, { "step": 300, "epoch": 0.16223889677550193, "wallclock": "2026-06-24T05:41:23.828815", "loss": 0.0868, "grad_norm": 0.7617091536521912, "learning_rate": 9.93165468955676e-06, "step_time_sec": 105.21, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 300, "epoch": 0.16223889677550193, "wallclock": "2026-06-24T05:43:03.826680", "eval_loss": 0.0827580988407135, "eval_runtime": 99.9942, "eval_samples_per_second": 5.0, "eval_steps_per_second": 1.25, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 305, "epoch": 0.16494287838842697, "wallclock": "2026-06-24T05:46:13.578643", "loss": 0.074, "grad_norm": 0.4504067301750183, "learning_rate": 9.927999521249347e-06, "step_time_sec": 289.75 }, { "step": 310, "epoch": 0.16764686000135198, "wallclock": "2026-06-24T05:48:00.303575", "loss": 0.1073, "grad_norm": 0.6431950330734253, "learning_rate": 9.924249849644205e-06, "step_time_sec": 106.72 }, { "step": 315, "epoch": 0.17035084161427702, "wallclock": "2026-06-24T05:49:45.680208", "loss": 0.1112, "grad_norm": 0.9043431878089905, "learning_rate": 9.920405746648067e-06, "step_time_sec": 105.38 }, { "step": 320, "epoch": 0.17305482322720206, "wallclock": "2026-06-24T05:51:32.404247", "loss": 0.0764, "grad_norm": 0.6045661568641663, "learning_rate": 9.916467285978556e-06, "step_time_sec": 106.72 }, { "step": 325, "epoch": 0.1757588048401271, "wallclock": "2026-06-24T05:53:18.181343", "loss": 0.0956, "grad_norm": 0.8464241623878479, "learning_rate": 9.912434543162769e-06, "step_time_sec": 105.78 }, { "step": 330, "epoch": 0.1784627864530521, "wallclock": "2026-06-24T05:55:03.598110", "loss": 0.1038, "grad_norm": 0.43105682730674744, "learning_rate": 9.908307595535842e-06, "step_time_sec": 105.42 }, { "step": 335, "epoch": 0.18116676806597715, "wallclock": "2026-06-24T05:56:50.010277", "loss": 0.1136, "grad_norm": 0.4054422676563263, "learning_rate": 9.904086522239455e-06, "step_time_sec": 106.41 }, { "step": 340, "epoch": 0.18387074967890218, "wallclock": "2026-06-24T05:58:35.002723", "loss": 0.081, "grad_norm": 0.7263162732124329, "learning_rate": 9.899771404220318e-06, "step_time_sec": 104.99 }, { "step": 345, "epoch": 0.18657473129182722, "wallclock": "2026-06-24T06:00:19.132697", "loss": 0.1075, "grad_norm": 1.7756342887878418, "learning_rate": 9.895362324228616e-06, "step_time_sec": 104.13 }, { "step": 350, "epoch": 0.18927871290475226, "wallclock": "2026-06-24T06:02:04.923652", "loss": 0.0862, "grad_norm": 0.4385850131511688, "learning_rate": 9.890859366816429e-06, "step_time_sec": 105.79, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 355, "epoch": 0.19198269451767727, "wallclock": "2026-06-24T06:03:51.291232", "loss": 0.1074, "grad_norm": 0.5257728099822998, "learning_rate": 9.886262618336103e-06, "step_time_sec": 106.37 }, { "step": 360, "epoch": 0.1946866761306023, "wallclock": "2026-06-24T06:05:36.306087", "loss": 0.1147, "grad_norm": 0.6658884882926941, "learning_rate": 9.881572166938598e-06, "step_time_sec": 105.01 }, { "step": 365, "epoch": 0.19739065774352735, "wallclock": "2026-06-24T06:07:22.917361", "loss": 0.0827, "grad_norm": 0.5998860597610474, "learning_rate": 9.876788102571797e-06, "step_time_sec": 106.61 }, { "step": 370, "epoch": 0.2000946393564524, "wallclock": "2026-06-24T06:09:08.111845", "loss": 0.1197, "grad_norm": 0.8167080879211426, "learning_rate": 9.871910516978782e-06, "step_time_sec": 105.19 }, { "step": 375, "epoch": 0.2027986209693774, "wallclock": "2026-06-24T06:10:53.235112", "loss": 0.1089, "grad_norm": 0.8197498321533203, "learning_rate": 9.86693950369607e-06, "step_time_sec": 105.12 }, { "step": 380, "epoch": 0.20550260258230243, "wallclock": "2026-06-24T06:12:39.106309", "loss": 0.0881, "grad_norm": 0.5486798882484436, "learning_rate": 9.861875158051831e-06, "step_time_sec": 105.87 }, { "step": 385, "epoch": 0.20820658419522747, "wallclock": "2026-06-24T06:14:24.784134", "loss": 0.0849, "grad_norm": 0.6048823595046997, "learning_rate": 9.85671757716404e-06, "step_time_sec": 105.68 }, { "step": 390, "epoch": 0.2109105658081525, "wallclock": "2026-06-24T06:16:09.415638", "loss": 0.1078, "grad_norm": 0.4019126296043396, "learning_rate": 9.851466859938637e-06, "step_time_sec": 104.63 }, { "step": 395, "epoch": 0.21361454742107755, "wallclock": "2026-06-24T06:17:56.691186", "loss": 0.1119, "grad_norm": 0.6954424381256104, "learning_rate": 9.84612310706761e-06, "step_time_sec": 107.28 }, { "step": 400, "epoch": 0.21631852903400256, "wallclock": "2026-06-24T06:19:42.292515", "loss": 0.0945, "grad_norm": 0.6359832882881165, "learning_rate": 9.840686421027085e-06, "step_time_sec": 105.6, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 400, "epoch": 0.21631852903400256, "wallclock": "2026-06-24T06:21:22.500688", "eval_loss": 0.08142668008804321, "eval_runtime": 100.2043, "eval_samples_per_second": 4.99, "eval_steps_per_second": 1.247, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 405, "epoch": 0.2190225106469276, "wallclock": "2026-06-24T06:24:29.504207", "loss": 0.0882, "grad_norm": 0.6086682081222534, "learning_rate": 9.835156906075338e-06, "step_time_sec": 287.21 }, { "step": 410, "epoch": 0.22172649225985264, "wallclock": "2026-06-24T06:26:16.103755", "loss": 0.1007, "grad_norm": 1.0316163301467896, "learning_rate": 9.829534668250814e-06, "step_time_sec": 106.6 }, { "step": 415, "epoch": 0.22443047387277767, "wallclock": "2026-06-24T06:28:01.409742", "loss": 0.093, "grad_norm": 0.7095230221748352, "learning_rate": 9.823819815370084e-06, "step_time_sec": 105.31 }, { "step": 420, "epoch": 0.22713445548570269, "wallclock": "2026-06-24T06:29:47.402659", "loss": 0.1019, "grad_norm": 0.7305953502655029, "learning_rate": 9.818012457025782e-06, "step_time_sec": 105.99 }, { "step": 425, "epoch": 0.22983843709862772, "wallclock": "2026-06-24T06:31:34.377021", "loss": 0.1021, "grad_norm": 0.5319082140922546, "learning_rate": 9.812112704584503e-06, "step_time_sec": 106.97 }, { "step": 430, "epoch": 0.23254241871155276, "wallclock": "2026-06-24T06:33:19.008040", "loss": 0.1063, "grad_norm": 0.8568723797798157, "learning_rate": 9.806120671184658e-06, "step_time_sec": 104.63 }, { "step": 435, "epoch": 0.2352464003244778, "wallclock": "2026-06-24T06:35:03.722758", "loss": 0.0605, "grad_norm": 0.45772790908813477, "learning_rate": 9.80003647173432e-06, "step_time_sec": 104.71 }, { "step": 440, "epoch": 0.23795038193740284, "wallclock": "2026-06-24T06:36:50.225935", "loss": 0.0991, "grad_norm": 0.7904582023620605, "learning_rate": 9.793860222909012e-06, "step_time_sec": 106.5 }, { "step": 445, "epoch": 0.24065436355032785, "wallclock": "2026-06-24T06:38:35.809144", "loss": 0.0913, "grad_norm": 0.5529101490974426, "learning_rate": 9.787592043149467e-06, "step_time_sec": 105.58 }, { "step": 450, "epoch": 0.2433583451632529, "wallclock": "2026-06-24T06:40:20.203877", "loss": 0.0679, "grad_norm": 0.7900363206863403, "learning_rate": 9.78123205265936e-06, "step_time_sec": 104.39, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 455, "epoch": 0.24606232677617793, "wallclock": "2026-06-24T06:42:06.492257", "loss": 0.1056, "grad_norm": 0.6831784248352051, "learning_rate": 9.774780373403003e-06, "step_time_sec": 106.29 }, { "step": 460, "epoch": 0.24876630838910296, "wallclock": "2026-06-24T06:43:52.409118", "loss": 0.0933, "grad_norm": 0.5478017330169678, "learning_rate": 9.768237129103009e-06, "step_time_sec": 105.92 }, { "step": 465, "epoch": 0.251470290002028, "wallclock": "2026-06-24T06:45:37.500906", "loss": 0.089, "grad_norm": 0.4542732238769531, "learning_rate": 9.761602445237914e-06, "step_time_sec": 105.09 }, { "step": 470, "epoch": 0.25417427161495304, "wallclock": "2026-06-24T06:47:23.183901", "loss": 0.1057, "grad_norm": 2.437464714050293, "learning_rate": 9.75487644903977e-06, "step_time_sec": 105.68 }, { "step": 475, "epoch": 0.25687825322787805, "wallclock": "2026-06-24T06:49:09.011851", "loss": 0.0774, "grad_norm": 0.6896166801452637, "learning_rate": 9.748059269491711e-06, "step_time_sec": 105.83 }, { "step": 480, "epoch": 0.25958223484080306, "wallclock": "2026-06-24T06:50:54.196439", "loss": 0.0913, "grad_norm": 0.5685729384422302, "learning_rate": 9.741151037325481e-06, "step_time_sec": 105.18 }, { "step": 485, "epoch": 0.2622862164537281, "wallclock": "2026-06-24T06:52:39.978133", "loss": 0.086, "grad_norm": 0.8516511917114258, "learning_rate": 9.73415188501891e-06, "step_time_sec": 105.78 }, { "step": 490, "epoch": 0.26499019806665314, "wallclock": "2026-06-24T06:54:27.309828", "loss": 0.0872, "grad_norm": 0.7482581734657288, "learning_rate": 9.727061946793402e-06, "step_time_sec": 107.33 }, { "step": 495, "epoch": 0.2676941796795782, "wallclock": "2026-06-24T06:56:12.188135", "loss": 0.0733, "grad_norm": 0.544495701789856, "learning_rate": 9.71988135861133e-06, "step_time_sec": 104.88 }, { "step": 500, "epoch": 0.2703981612925032, "wallclock": "2026-06-24T06:57:57.321125", "loss": 0.0771, "grad_norm": 0.6160959005355835, "learning_rate": 9.712610258173453e-06, "step_time_sec": 105.13, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.98 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 500, "epoch": 0.2703981612925032, "wallclock": "2026-06-24T06:59:37.554773", "eval_loss": 0.0792667418718338, "eval_runtime": 100.2297, "eval_samples_per_second": 4.989, "eval_steps_per_second": 1.247, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 505, "epoch": 0.2731021429054282, "wallclock": "2026-06-24T07:02:45.894476", "loss": 0.0798, "grad_norm": 0.8482615351676941, "learning_rate": 9.705248784916267e-06, "step_time_sec": 288.57 }, { "step": 510, "epoch": 0.2758061245183533, "wallclock": "2026-06-24T07:04:30.505457", "loss": 0.0955, "grad_norm": 0.5648516416549683, "learning_rate": 9.697797080009323e-06, "step_time_sec": 104.61 }, { "step": 515, "epoch": 0.2785101061312783, "wallclock": "2026-06-24T07:06:15.804916", "loss": 0.082, "grad_norm": 0.6227542757987976, "learning_rate": 9.690255286352532e-06, "step_time_sec": 105.3 }, { "step": 520, "epoch": 0.28121408774420337, "wallclock": "2026-06-24T07:08:01.704738", "loss": 0.1104, "grad_norm": 0.7219036221504211, "learning_rate": 9.682623548573418e-06, "step_time_sec": 105.9 }, { "step": 525, "epoch": 0.2839180693571284, "wallclock": "2026-06-24T07:09:47.411077", "loss": 0.0873, "grad_norm": 0.5870639681816101, "learning_rate": 9.674902013024348e-06, "step_time_sec": 105.71 }, { "step": 530, "epoch": 0.2866220509700534, "wallclock": "2026-06-24T07:11:33.115669", "loss": 0.1001, "grad_norm": 0.5214188694953918, "learning_rate": 9.667090827779721e-06, "step_time_sec": 105.7 }, { "step": 535, "epoch": 0.28932603258297845, "wallclock": "2026-06-24T07:13:18.886097", "loss": 0.0888, "grad_norm": 0.5477219223976135, "learning_rate": 9.659190142633133e-06, "step_time_sec": 105.77 }, { "step": 540, "epoch": 0.29203001419590346, "wallclock": "2026-06-24T07:15:03.294899", "loss": 0.0893, "grad_norm": 0.6372500061988831, "learning_rate": 9.651200109094498e-06, "step_time_sec": 104.41 }, { "step": 545, "epoch": 0.2947339958088285, "wallclock": "2026-06-24T07:16:49.807494", "loss": 0.0706, "grad_norm": 0.6478589177131653, "learning_rate": 9.643120880387155e-06, "step_time_sec": 106.51 }, { "step": 550, "epoch": 0.29743797742175354, "wallclock": "2026-06-24T07:18:36.819978", "loss": 0.0848, "grad_norm": 0.7352571487426758, "learning_rate": 9.634952611444914e-06, "step_time_sec": 107.01, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 555, "epoch": 0.30014195903467855, "wallclock": "2026-06-24T07:20:22.813541", "loss": 0.0956, "grad_norm": 0.8457625508308411, "learning_rate": 9.626695458909098e-06, "step_time_sec": 105.99 }, { "step": 560, "epoch": 0.3028459406476036, "wallclock": "2026-06-24T07:22:08.475130", "loss": 0.082, "grad_norm": 0.8473530411720276, "learning_rate": 9.618349581125529e-06, "step_time_sec": 105.66 }, { "step": 565, "epoch": 0.30554992226052863, "wallclock": "2026-06-24T07:23:54.730055", "loss": 0.0666, "grad_norm": 0.7220405340194702, "learning_rate": 9.609915138141497e-06, "step_time_sec": 106.25 }, { "step": 570, "epoch": 0.30825390387345364, "wallclock": "2026-06-24T07:25:39.415962", "loss": 0.0714, "grad_norm": 0.6538407206535339, "learning_rate": 9.601392291702693e-06, "step_time_sec": 104.69 }, { "step": 575, "epoch": 0.3109578854863787, "wallclock": "2026-06-24T07:27:23.791188", "loss": 0.073, "grad_norm": 0.851050853729248, "learning_rate": 9.592781205250102e-06, "step_time_sec": 104.38 }, { "step": 580, "epoch": 0.3136618670993037, "wallclock": "2026-06-24T07:29:08.982414", "loss": 0.0972, "grad_norm": 0.7455153465270996, "learning_rate": 9.584082043916867e-06, "step_time_sec": 105.19 }, { "step": 585, "epoch": 0.3163658487122288, "wallclock": "2026-06-24T07:30:54.304933", "loss": 0.0728, "grad_norm": 0.39666956663131714, "learning_rate": 9.575294974525131e-06, "step_time_sec": 105.32 }, { "step": 590, "epoch": 0.3190698303251538, "wallclock": "2026-06-24T07:32:40.198440", "loss": 0.0719, "grad_norm": 0.283635675907135, "learning_rate": 9.566420165582832e-06, "step_time_sec": 105.89 }, { "step": 595, "epoch": 0.3217738119380788, "wallclock": "2026-06-24T07:34:26.091391", "loss": 0.0892, "grad_norm": 0.6910920739173889, "learning_rate": 9.557457787280474e-06, "step_time_sec": 105.89 }, { "step": 600, "epoch": 0.32447779355100387, "wallclock": "2026-06-24T07:36:12.209290", "loss": 0.0895, "grad_norm": 0.6658245325088501, "learning_rate": 9.548408011487857e-06, "step_time_sec": 106.12, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 600, "epoch": 0.32447779355100387, "wallclock": "2026-06-24T07:37:52.689311", "eval_loss": 0.07775916159152985, "eval_runtime": 100.4763, "eval_samples_per_second": 4.976, "eval_steps_per_second": 1.244, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 605, "epoch": 0.3271817751639289, "wallclock": "2026-06-24T07:40:59.587964", "loss": 0.0984, "grad_norm": 0.6905266046524048, "learning_rate": 9.539271011750787e-06, "step_time_sec": 287.38 }, { "step": 610, "epoch": 0.32988575677685394, "wallclock": "2026-06-24T07:42:47.180462", "loss": 0.0823, "grad_norm": 0.6062604784965515, "learning_rate": 9.530046963287753e-06, "step_time_sec": 107.59 }, { "step": 615, "epoch": 0.33258973838977895, "wallclock": "2026-06-24T07:44:31.276448", "loss": 0.0767, "grad_norm": 0.7175081968307495, "learning_rate": 9.520736042986551e-06, "step_time_sec": 104.1 }, { "step": 620, "epoch": 0.33529372000270397, "wallclock": "2026-06-24T07:46:15.885770", "loss": 0.0882, "grad_norm": 0.898894190788269, "learning_rate": 9.51133842940091e-06, "step_time_sec": 104.61 }, { "step": 625, "epoch": 0.33799770161562903, "wallclock": "2026-06-24T07:48:01.530170", "loss": 0.0873, "grad_norm": 0.5385039448738098, "learning_rate": 9.501854302747053e-06, "step_time_sec": 105.64 }, { "step": 630, "epoch": 0.34070168322855404, "wallclock": "2026-06-24T07:49:46.498449", "loss": 0.0817, "grad_norm": 0.5420588850975037, "learning_rate": 9.492283844900255e-06, "step_time_sec": 104.97 }, { "step": 635, "epoch": 0.34340566484147905, "wallclock": "2026-06-24T07:51:31.809385", "loss": 0.0879, "grad_norm": 1.3086037635803223, "learning_rate": 9.482627239391335e-06, "step_time_sec": 105.31 }, { "step": 640, "epoch": 0.3461096464544041, "wallclock": "2026-06-24T07:53:18.306307", "loss": 0.0728, "grad_norm": 0.6617655158042908, "learning_rate": 9.472884671403164e-06, "step_time_sec": 106.5 }, { "step": 645, "epoch": 0.34881362806732913, "wallclock": "2026-06-24T07:55:03.697153", "loss": 0.0593, "grad_norm": 0.6209415197372437, "learning_rate": 9.46305632776709e-06, "step_time_sec": 105.39 }, { "step": 650, "epoch": 0.3515176096802542, "wallclock": "2026-06-24T07:56:48.600267", "loss": 0.0816, "grad_norm": 1.021694302558899, "learning_rate": 9.453142396959364e-06, "step_time_sec": 104.9, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 655, "epoch": 0.3542215912931792, "wallclock": "2026-06-24T07:58:33.519930", "loss": 0.0802, "grad_norm": 0.8565160036087036, "learning_rate": 9.443143069097531e-06, "step_time_sec": 104.92 }, { "step": 660, "epoch": 0.3569255729061042, "wallclock": "2026-06-24T08:00:17.709412", "loss": 0.073, "grad_norm": 0.8225128650665283, "learning_rate": 9.433058535936775e-06, "step_time_sec": 104.19 }, { "step": 665, "epoch": 0.3596295545190293, "wallclock": "2026-06-24T08:02:02.897175", "loss": 0.0805, "grad_norm": 0.8371864557266235, "learning_rate": 9.422888990866243e-06, "step_time_sec": 105.19 }, { "step": 670, "epoch": 0.3623335361319543, "wallclock": "2026-06-24T08:03:49.320420", "loss": 0.0855, "grad_norm": 0.6681428551673889, "learning_rate": 9.412634628905345e-06, "step_time_sec": 106.42 }, { "step": 675, "epoch": 0.36503751774487936, "wallclock": "2026-06-24T08:05:33.903831", "loss": 0.0768, "grad_norm": 0.6769019365310669, "learning_rate": 9.402295646700005e-06, "step_time_sec": 104.58 }, { "step": 680, "epoch": 0.36774149935780437, "wallclock": "2026-06-24T08:07:19.895810", "loss": 0.0829, "grad_norm": 0.5479181408882141, "learning_rate": 9.391872242518895e-06, "step_time_sec": 105.99 }, { "step": 685, "epoch": 0.3704454809707294, "wallclock": "2026-06-24T08:09:05.010792", "loss": 0.0745, "grad_norm": 0.499809205532074, "learning_rate": 9.381364616249627e-06, "step_time_sec": 105.11 }, { "step": 690, "epoch": 0.37314946258365445, "wallclock": "2026-06-24T08:10:50.298200", "loss": 0.0735, "grad_norm": 1.0203771591186523, "learning_rate": 9.370772969394927e-06, "step_time_sec": 105.29 }, { "step": 695, "epoch": 0.37585344419657946, "wallclock": "2026-06-24T08:12:35.415602", "loss": 0.0705, "grad_norm": 0.7761706113815308, "learning_rate": 9.360097505068767e-06, "step_time_sec": 105.12 }, { "step": 700, "epoch": 0.3785574258095045, "wallclock": "2026-06-24T08:14:21.710932", "loss": 0.0984, "grad_norm": 2.708282709121704, "learning_rate": 9.349338427992471e-06, "step_time_sec": 106.3, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 700, "epoch": 0.3785574258095045, "wallclock": "2026-06-24T08:16:02.035763", "eval_loss": 0.07459608465433121, "eval_runtime": 100.3142, "eval_samples_per_second": 4.984, "eval_steps_per_second": 1.246, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 705, "epoch": 0.38126140742242953, "wallclock": "2026-06-24T08:19:08.313913", "loss": 0.0745, "grad_norm": 0.4694831073284149, "learning_rate": 9.338495944490788e-06, "step_time_sec": 286.6 }, { "step": 710, "epoch": 0.38396538903535454, "wallclock": "2026-06-24T08:20:53.768411", "loss": 0.1006, "grad_norm": 0.715364396572113, "learning_rate": 9.327570262487934e-06, "step_time_sec": 105.45 }, { "step": 715, "epoch": 0.3866693706482796, "wallclock": "2026-06-24T08:22:39.591734", "loss": 0.0863, "grad_norm": 0.5401411056518555, "learning_rate": 9.316561591503612e-06, "step_time_sec": 105.82 }, { "step": 720, "epoch": 0.3893733522612046, "wallclock": "2026-06-24T08:24:24.650372", "loss": 0.0955, "grad_norm": 0.5890225768089294, "learning_rate": 9.305470142648982e-06, "step_time_sec": 105.06 }, { "step": 725, "epoch": 0.39207733387412963, "wallclock": "2026-06-24T08:26:09.603072", "loss": 0.0733, "grad_norm": 0.6464399695396423, "learning_rate": 9.294296128622625e-06, "step_time_sec": 104.95 }, { "step": 730, "epoch": 0.3947813154870547, "wallclock": "2026-06-24T08:27:56.312801", "loss": 0.0723, "grad_norm": 0.5359171628952026, "learning_rate": 9.283039763706455e-06, "step_time_sec": 106.71 }, { "step": 735, "epoch": 0.3974852970999797, "wallclock": "2026-06-24T08:29:41.888724", "loss": 0.0767, "grad_norm": 0.7463257908821106, "learning_rate": 9.27170126376161e-06, "step_time_sec": 105.58 }, { "step": 740, "epoch": 0.4001892787129048, "wallclock": "2026-06-24T08:31:26.792262", "loss": 0.0696, "grad_norm": 0.8311108946800232, "learning_rate": 9.260280846224328e-06, "step_time_sec": 104.9 }, { "step": 745, "epoch": 0.4028932603258298, "wallclock": "2026-06-24T08:33:12.308668", "loss": 0.0674, "grad_norm": 0.7888720631599426, "learning_rate": 9.24877873010175e-06, "step_time_sec": 105.52 }, { "step": 750, "epoch": 0.4055972419387548, "wallclock": "2026-06-24T08:34:57.915757", "loss": 0.0689, "grad_norm": 0.5358040928840637, "learning_rate": 9.237195135967746e-06, "step_time_sec": 105.61, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 755, "epoch": 0.40830122355167986, "wallclock": "2026-06-24T08:36:43.108457", "loss": 0.0991, "grad_norm": 0.7390076518058777, "learning_rate": 9.225530285958669e-06, "step_time_sec": 105.19 }, { "step": 760, "epoch": 0.41100520516460487, "wallclock": "2026-06-24T08:38:29.912596", "loss": 0.0848, "grad_norm": 0.6667785048484802, "learning_rate": 9.213784403769097e-06, "step_time_sec": 106.8 }, { "step": 765, "epoch": 0.41370918677752994, "wallclock": "2026-06-24T08:40:15.181668", "loss": 0.064, "grad_norm": 1.133137583732605, "learning_rate": 9.201957714647554e-06, "step_time_sec": 105.27 }, { "step": 770, "epoch": 0.41641316839045495, "wallclock": "2026-06-24T08:42:01.118538", "loss": 0.0654, "grad_norm": 0.8551876544952393, "learning_rate": 9.19005044539218e-06, "step_time_sec": 105.94 }, { "step": 775, "epoch": 0.41911715000337996, "wallclock": "2026-06-24T08:43:47.610874", "loss": 0.0846, "grad_norm": 0.7466854453086853, "learning_rate": 9.178062824346383e-06, "step_time_sec": 106.49 }, { "step": 780, "epoch": 0.421821131616305, "wallclock": "2026-06-24T08:45:33.303287", "loss": 0.0912, "grad_norm": 0.5469369292259216, "learning_rate": 9.165995081394463e-06, "step_time_sec": 105.69 }, { "step": 785, "epoch": 0.42452511322923003, "wallclock": "2026-06-24T08:47:18.578926", "loss": 0.0864, "grad_norm": 0.9799915552139282, "learning_rate": 9.153847447957205e-06, "step_time_sec": 105.28 }, { "step": 790, "epoch": 0.4272290948421551, "wallclock": "2026-06-24T08:49:06.294584", "loss": 0.0954, "grad_norm": 0.6794901490211487, "learning_rate": 9.141620156987432e-06, "step_time_sec": 107.72 }, { "step": 795, "epoch": 0.4299330764550801, "wallclock": "2026-06-24T08:50:52.095214", "loss": 0.0547, "grad_norm": 0.6824802160263062, "learning_rate": 9.12931344296555e-06, "step_time_sec": 105.8 }, { "step": 800, "epoch": 0.4326370580680051, "wallclock": "2026-06-24T08:52:36.490555", "loss": 0.0677, "grad_norm": 0.5517615675926208, "learning_rate": 9.116927541895042e-06, "step_time_sec": 104.4, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 800, "epoch": 0.4326370580680051, "wallclock": "2026-06-24T08:54:16.950971", "eval_loss": 0.07352492958307266, "eval_runtime": 100.4561, "eval_samples_per_second": 4.977, "eval_steps_per_second": 1.244, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 805, "epoch": 0.4353410396809302, "wallclock": "2026-06-24T08:57:27.878085", "loss": 0.0604, "grad_norm": 0.561560869216919, "learning_rate": 9.10446269129795e-06, "step_time_sec": 291.39 }, { "step": 810, "epoch": 0.4380450212938552, "wallclock": "2026-06-24T08:59:12.110183", "loss": 0.0793, "grad_norm": 0.784087061882019, "learning_rate": 9.091919130210313e-06, "step_time_sec": 104.23 }, { "step": 815, "epoch": 0.4407490029067802, "wallclock": "2026-06-24T09:00:57.619395", "loss": 0.0523, "grad_norm": 0.47488337755203247, "learning_rate": 9.079297099177585e-06, "step_time_sec": 105.51 }, { "step": 820, "epoch": 0.4434529845197053, "wallclock": "2026-06-24T09:02:44.201504", "loss": 0.092, "grad_norm": 0.6607430577278137, "learning_rate": 9.066596840250024e-06, "step_time_sec": 106.58 }, { "step": 825, "epoch": 0.4461569661326303, "wallclock": "2026-06-24T09:04:30.495107", "loss": 0.0559, "grad_norm": 0.5975196361541748, "learning_rate": 9.053818596978051e-06, "step_time_sec": 106.29 }, { "step": 830, "epoch": 0.44886094774555535, "wallclock": "2026-06-24T09:06:14.400764", "loss": 0.0749, "grad_norm": 0.5973978042602539, "learning_rate": 9.040962614407574e-06, "step_time_sec": 103.91 }, { "step": 835, "epoch": 0.45156492935848036, "wallclock": "2026-06-24T09:08:00.823060", "loss": 0.0673, "grad_norm": 0.8808339238166809, "learning_rate": 9.028029139075297e-06, "step_time_sec": 106.42 }, { "step": 840, "epoch": 0.45426891097140537, "wallclock": "2026-06-24T09:09:46.093782", "loss": 0.0975, "grad_norm": 0.9540690779685974, "learning_rate": 9.015018419003982e-06, "step_time_sec": 105.27 }, { "step": 845, "epoch": 0.45697289258433044, "wallclock": "2026-06-24T09:11:31.785014", "loss": 0.0815, "grad_norm": 0.7579560875892639, "learning_rate": 9.001930703697708e-06, "step_time_sec": 105.69 }, { "step": 850, "epoch": 0.45967687419725545, "wallclock": "2026-06-24T09:13:18.016135", "loss": 0.077, "grad_norm": 1.2188389301300049, "learning_rate": 8.988766244137065e-06, "step_time_sec": 106.23, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 855, "epoch": 0.4623808558101805, "wallclock": "2026-06-24T09:15:04.661348", "loss": 0.0777, "grad_norm": 0.7465812563896179, "learning_rate": 8.975525292774362e-06, "step_time_sec": 106.65 }, { "step": 860, "epoch": 0.4650848374231055, "wallclock": "2026-06-24T09:16:51.200338", "loss": 0.065, "grad_norm": 0.634141206741333, "learning_rate": 8.962208103528774e-06, "step_time_sec": 106.54 }, { "step": 865, "epoch": 0.46778881903603053, "wallclock": "2026-06-24T09:18:37.705663", "loss": 0.0723, "grad_norm": 0.5434172749519348, "learning_rate": 8.948814931781472e-06, "step_time_sec": 106.51 }, { "step": 870, "epoch": 0.4704928006489556, "wallclock": "2026-06-24T09:20:22.682651", "loss": 0.0674, "grad_norm": 0.851901650428772, "learning_rate": 8.935346034370732e-06, "step_time_sec": 104.98 }, { "step": 875, "epoch": 0.4731967822618806, "wallclock": "2026-06-24T09:22:08.000109", "loss": 0.0648, "grad_norm": 0.5568099617958069, "learning_rate": 8.921801669587005e-06, "step_time_sec": 105.32 }, { "step": 880, "epoch": 0.4759007638748057, "wallclock": "2026-06-24T09:23:53.107265", "loss": 0.073, "grad_norm": 0.726121723651886, "learning_rate": 8.908182097167965e-06, "step_time_sec": 105.11 }, { "step": 885, "epoch": 0.4786047454877307, "wallclock": "2026-06-24T09:25:38.993222", "loss": 0.0676, "grad_norm": 0.6540066003799438, "learning_rate": 8.894487578293534e-06, "step_time_sec": 105.89 }, { "step": 890, "epoch": 0.4813087271006557, "wallclock": "2026-06-24T09:27:25.285677", "loss": 0.0699, "grad_norm": 0.5976990461349487, "learning_rate": 8.880718375580857e-06, "step_time_sec": 106.29 }, { "step": 895, "epoch": 0.48401270871358076, "wallclock": "2026-06-24T09:29:12.234716", "loss": 0.0687, "grad_norm": 0.5673884749412537, "learning_rate": 8.866874753079286e-06, "step_time_sec": 106.95 }, { "step": 900, "epoch": 0.4867166903265058, "wallclock": "2026-06-24T09:30:57.290947", "loss": 0.0865, "grad_norm": 0.825077474117279, "learning_rate": 8.852956976265304e-06, "step_time_sec": 105.06, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 900, "epoch": 0.4867166903265058, "wallclock": "2026-06-24T09:32:37.746275", "eval_loss": 0.0734986960887909, "eval_runtime": 100.4508, "eval_samples_per_second": 4.978, "eval_steps_per_second": 1.244, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 905, "epoch": 0.4894206719394308, "wallclock": "2026-06-24T09:35:44.600337", "loss": 0.0661, "grad_norm": 0.6024107933044434, "learning_rate": 8.838965312037435e-06, "step_time_sec": 287.31 }, { "step": 910, "epoch": 0.49212465355235585, "wallclock": "2026-06-24T09:37:31.615349", "loss": 0.0864, "grad_norm": 0.7744714617729187, "learning_rate": 8.824900028711128e-06, "step_time_sec": 107.02 }, { "step": 915, "epoch": 0.49482863516528086, "wallclock": "2026-06-24T09:39:17.106852", "loss": 0.0608, "grad_norm": 0.6232128143310547, "learning_rate": 8.810761396013616e-06, "step_time_sec": 105.49 }, { "step": 920, "epoch": 0.4975326167782059, "wallclock": "2026-06-24T09:41:01.831415", "loss": 0.0667, "grad_norm": 0.7434114217758179, "learning_rate": 8.796549685078732e-06, "step_time_sec": 104.72 }, { "step": 925, "epoch": 0.500236598391131, "wallclock": "2026-06-24T09:42:49.107296", "loss": 0.0545, "grad_norm": 0.5467560887336731, "learning_rate": 8.782265168441722e-06, "step_time_sec": 107.28 }, { "step": 930, "epoch": 0.502940580004056, "wallclock": "2026-06-24T09:44:33.913707", "loss": 0.0608, "grad_norm": 0.49254247546195984, "learning_rate": 8.76790812003401e-06, "step_time_sec": 104.81 }, { "step": 935, "epoch": 0.505644561616981, "wallclock": "2026-06-24T09:46:19.229213", "loss": 0.0684, "grad_norm": 0.48784705996513367, "learning_rate": 8.753478815177947e-06, "step_time_sec": 105.32 }, { "step": 940, "epoch": 0.5083485432299061, "wallclock": "2026-06-24T09:48:05.411373", "loss": 0.0731, "grad_norm": 0.48523765802383423, "learning_rate": 8.738977530581534e-06, "step_time_sec": 106.18 }, { "step": 945, "epoch": 0.511052524842831, "wallclock": "2026-06-24T09:49:51.295620", "loss": 0.0843, "grad_norm": 1.2344911098480225, "learning_rate": 8.724404544333111e-06, "step_time_sec": 105.88 }, { "step": 950, "epoch": 0.5137565064557561, "wallclock": "2026-06-24T09:51:36.494251", "loss": 0.0754, "grad_norm": 0.9410877227783203, "learning_rate": 8.709760135896033e-06, "step_time_sec": 105.2, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 955, "epoch": 0.5164604880686812, "wallclock": "2026-06-24T09:53:21.420428", "loss": 0.0843, "grad_norm": 0.3976033329963684, "learning_rate": 8.695044586103297e-06, "step_time_sec": 104.93 }, { "step": 960, "epoch": 0.5191644696816061, "wallclock": "2026-06-24T09:55:05.899797", "loss": 0.0742, "grad_norm": 0.7704766392707825, "learning_rate": 8.680258177152166e-06, "step_time_sec": 104.48 }, { "step": 965, "epoch": 0.5218684512945312, "wallclock": "2026-06-24T09:56:51.000492", "loss": 0.0685, "grad_norm": 0.7557464838027954, "learning_rate": 8.665401192598761e-06, "step_time_sec": 105.1 }, { "step": 970, "epoch": 0.5245724329074563, "wallclock": "2026-06-24T09:58:36.814098", "loss": 0.0646, "grad_norm": 0.8066175580024719, "learning_rate": 8.65047391735261e-06, "step_time_sec": 105.81 }, { "step": 975, "epoch": 0.5272764145203812, "wallclock": "2026-06-24T10:00:21.726492", "loss": 0.0703, "grad_norm": 1.2292455434799194, "learning_rate": 8.635476637671197e-06, "step_time_sec": 104.91 }, { "step": 980, "epoch": 0.5299803961333063, "wallclock": "2026-06-24T10:02:07.309600", "loss": 0.0722, "grad_norm": 0.7355031967163086, "learning_rate": 8.620409641154465e-06, "step_time_sec": 105.58 }, { "step": 985, "epoch": 0.5326843777462313, "wallclock": "2026-06-24T10:03:54.319283", "loss": 0.0593, "grad_norm": 0.9767148494720459, "learning_rate": 8.605273216739307e-06, "step_time_sec": 107.01 }, { "step": 990, "epoch": 0.5353883593591564, "wallclock": "2026-06-24T10:05:40.086818", "loss": 0.0597, "grad_norm": 0.6078879237174988, "learning_rate": 8.590067654694017e-06, "step_time_sec": 105.77 }, { "step": 995, "epoch": 0.5380923409720814, "wallclock": "2026-06-24T10:07:24.909120", "loss": 0.0819, "grad_norm": 0.5737846493721008, "learning_rate": 8.574793246612727e-06, "step_time_sec": 104.82 }, { "step": 1000, "epoch": 0.5407963225850064, "wallclock": "2026-06-24T10:09:11.121871", "loss": 0.0704, "grad_norm": 0.5743271708488464, "learning_rate": 8.559450285409825e-06, "step_time_sec": 106.21, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1000, "epoch": 0.5407963225850064, "wallclock": "2026-06-24T10:10:51.453842", "eval_loss": 0.07477952539920807, "eval_runtime": 100.3276, "eval_samples_per_second": 4.984, "eval_steps_per_second": 1.246, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1005, "epoch": 0.5435003041979315, "wallclock": "2026-06-24T10:13:58.794483", "loss": 0.0445, "grad_norm": 0.3843560218811035, "learning_rate": 8.544039065314317e-06, "step_time_sec": 287.67 }, { "step": 1010, "epoch": 0.5462042858108564, "wallclock": "2026-06-24T10:15:44.821711", "loss": 0.0824, "grad_norm": 0.788098156452179, "learning_rate": 8.528559881864209e-06, "step_time_sec": 106.03 }, { "step": 1015, "epoch": 0.5489082674237815, "wallclock": "2026-06-24T10:17:30.511914", "loss": 0.0593, "grad_norm": 0.5601520538330078, "learning_rate": 8.513013031900814e-06, "step_time_sec": 105.69 }, { "step": 1020, "epoch": 0.5516122490367066, "wallclock": "2026-06-24T10:19:15.717942", "loss": 0.0578, "grad_norm": 0.4331408739089966, "learning_rate": 8.497398813563086e-06, "step_time_sec": 105.21 }, { "step": 1025, "epoch": 0.5543162306496315, "wallclock": "2026-06-24T10:21:00.311987", "loss": 0.0772, "grad_norm": 0.7381686568260193, "learning_rate": 8.48171752628188e-06, "step_time_sec": 104.59 }, { "step": 1030, "epoch": 0.5570202122625566, "wallclock": "2026-06-24T10:22:46.121305", "loss": 0.052, "grad_norm": 0.7812600135803223, "learning_rate": 8.46596947077422e-06, "step_time_sec": 105.81 }, { "step": 1035, "epoch": 0.5597241938754817, "wallclock": "2026-06-24T10:24:31.783704", "loss": 0.0757, "grad_norm": 0.7333759069442749, "learning_rate": 8.450154949037539e-06, "step_time_sec": 105.66 }, { "step": 1040, "epoch": 0.5624281754884067, "wallclock": "2026-06-24T10:26:17.283088", "loss": 0.0588, "grad_norm": 0.7570787668228149, "learning_rate": 8.434274264343869e-06, "step_time_sec": 105.5 }, { "step": 1045, "epoch": 0.5651321571013317, "wallclock": "2026-06-24T10:28:03.723823", "loss": 0.0491, "grad_norm": 0.42195039987564087, "learning_rate": 8.418327721234044e-06, "step_time_sec": 106.44 }, { "step": 1050, "epoch": 0.5678361387142568, "wallclock": "2026-06-24T10:29:49.107217", "loss": 0.0524, "grad_norm": 0.5051612257957458, "learning_rate": 8.40231562551185e-06, "step_time_sec": 105.38, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1055, "epoch": 0.5705401203271818, "wallclock": "2026-06-24T10:31:34.010033", "loss": 0.0582, "grad_norm": 0.6454283595085144, "learning_rate": 8.386238284238163e-06, "step_time_sec": 104.9 }, { "step": 1060, "epoch": 0.5732441019401068, "wallclock": "2026-06-24T10:33:20.785609", "loss": 0.0835, "grad_norm": 0.9067649841308594, "learning_rate": 8.37009600572506e-06, "step_time_sec": 106.78 }, { "step": 1065, "epoch": 0.5759480835530318, "wallclock": "2026-06-24T10:35:06.993088", "loss": 0.0608, "grad_norm": 0.5329049825668335, "learning_rate": 8.35388909952991e-06, "step_time_sec": 106.21 }, { "step": 1070, "epoch": 0.5786520651659569, "wallclock": "2026-06-24T10:36:51.613853", "loss": 0.0732, "grad_norm": 1.0739482641220093, "learning_rate": 8.337617876449427e-06, "step_time_sec": 104.62 }, { "step": 1075, "epoch": 0.5813560467788819, "wallclock": "2026-06-24T10:38:38.500861", "loss": 0.0712, "grad_norm": 0.5759013295173645, "learning_rate": 8.321282648513727e-06, "step_time_sec": 106.89 }, { "step": 1080, "epoch": 0.5840600283918069, "wallclock": "2026-06-24T10:40:24.115300", "loss": 0.06, "grad_norm": 0.5534053444862366, "learning_rate": 8.304883728980325e-06, "step_time_sec": 105.61 }, { "step": 1085, "epoch": 0.586764010004732, "wallclock": "2026-06-24T10:42:09.998730", "loss": 0.0613, "grad_norm": 0.7383453845977783, "learning_rate": 8.288421432328146e-06, "step_time_sec": 105.88 }, { "step": 1090, "epoch": 0.589467991617657, "wallclock": "2026-06-24T10:43:56.874611", "loss": 0.0611, "grad_norm": 0.5191856026649475, "learning_rate": 8.271896074251483e-06, "step_time_sec": 106.88 }, { "step": 1095, "epoch": 0.592171973230582, "wallclock": "2026-06-24T10:45:41.729013", "loss": 0.0571, "grad_norm": 0.5893298983573914, "learning_rate": 8.255307971653941e-06, "step_time_sec": 104.85 }, { "step": 1100, "epoch": 0.5948759548435071, "wallclock": "2026-06-24T10:47:28.787718", "loss": 0.0579, "grad_norm": 0.6628295183181763, "learning_rate": 8.238657442642375e-06, "step_time_sec": 107.06, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1100, "epoch": 0.5948759548435071, "wallclock": "2026-06-24T10:49:09.237050", "eval_loss": 0.07260795682668686, "eval_runtime": 100.4446, "eval_samples_per_second": 4.978, "eval_steps_per_second": 1.244, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1105, "epoch": 0.5975799364564321, "wallclock": "2026-06-24T10:52:18.585949", "loss": 0.0701, "grad_norm": 0.9406186938285828, "learning_rate": 8.221944806520768e-06, "step_time_sec": 289.8 }, { "step": 1110, "epoch": 0.6002839180693571, "wallclock": "2026-06-24T10:54:03.822666", "loss": 0.0709, "grad_norm": 0.6916760206222534, "learning_rate": 8.205170383784125e-06, "step_time_sec": 105.24 }, { "step": 1115, "epoch": 0.6029878996822822, "wallclock": "2026-06-24T10:55:49.426292", "loss": 0.0591, "grad_norm": 0.4082253575325012, "learning_rate": 8.188334496112322e-06, "step_time_sec": 105.6 }, { "step": 1120, "epoch": 0.6056918812952072, "wallclock": "2026-06-24T10:57:36.587027", "loss": 0.053, "grad_norm": 0.5415107011795044, "learning_rate": 8.171437466363934e-06, "step_time_sec": 107.16 }, { "step": 1125, "epoch": 0.6083958629081322, "wallclock": "2026-06-24T10:59:22.605930", "loss": 0.0612, "grad_norm": 0.5975248217582703, "learning_rate": 8.154479618570046e-06, "step_time_sec": 106.02 }, { "step": 1130, "epoch": 0.6110998445210573, "wallclock": "2026-06-24T11:01:07.592225", "loss": 0.0632, "grad_norm": 0.36586880683898926, "learning_rate": 8.137461277928039e-06, "step_time_sec": 104.99 }, { "step": 1135, "epoch": 0.6138038261339823, "wallclock": "2026-06-24T11:02:52.501190", "loss": 0.0638, "grad_norm": 0.6821796298027039, "learning_rate": 8.120382770795354e-06, "step_time_sec": 104.91 }, { "step": 1140, "epoch": 0.6165078077469073, "wallclock": "2026-06-24T11:04:37.321268", "loss": 0.0672, "grad_norm": 0.7406355142593384, "learning_rate": 8.103244424683232e-06, "step_time_sec": 104.82 }, { "step": 1145, "epoch": 0.6192117893598323, "wallclock": "2026-06-24T11:06:23.997767", "loss": 0.0551, "grad_norm": 0.6757558584213257, "learning_rate": 8.086046568250438e-06, "step_time_sec": 106.68 }, { "step": 1150, "epoch": 0.6219157709727574, "wallclock": "2026-06-24T11:08:10.298796", "loss": 0.0585, "grad_norm": 0.6179367899894714, "learning_rate": 8.06878953129695e-06, "step_time_sec": 106.3, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1155, "epoch": 0.6246197525856824, "wallclock": "2026-06-24T11:09:56.115384", "loss": 0.0607, "grad_norm": 0.9675397872924805, "learning_rate": 8.051473644757644e-06, "step_time_sec": 105.82 }, { "step": 1160, "epoch": 0.6273237341986074, "wallclock": "2026-06-24T11:11:41.181597", "loss": 0.0779, "grad_norm": 0.6827834248542786, "learning_rate": 8.034099240695942e-06, "step_time_sec": 105.07 }, { "step": 1165, "epoch": 0.6300277158115325, "wallclock": "2026-06-24T11:13:27.479419", "loss": 0.0539, "grad_norm": 0.43536603450775146, "learning_rate": 8.016666652297443e-06, "step_time_sec": 106.3 }, { "step": 1170, "epoch": 0.6327316974244576, "wallclock": "2026-06-24T11:15:14.490172", "loss": 0.0508, "grad_norm": 0.5545168519020081, "learning_rate": 7.999176213863536e-06, "step_time_sec": 107.01 }, { "step": 1175, "epoch": 0.6354356790373825, "wallclock": "2026-06-24T11:17:00.009949", "loss": 0.0414, "grad_norm": 0.43939608335494995, "learning_rate": 7.981628260804992e-06, "step_time_sec": 105.52 }, { "step": 1180, "epoch": 0.6381396606503076, "wallclock": "2026-06-24T11:18:49.614174", "loss": 0.0585, "grad_norm": 0.7514466047286987, "learning_rate": 7.964023129635528e-06, "step_time_sec": 109.6 }, { "step": 1185, "epoch": 0.6408436422632326, "wallclock": "2026-06-24T11:20:35.886257", "loss": 0.0742, "grad_norm": 1.2430953979492188, "learning_rate": 7.946361157965354e-06, "step_time_sec": 106.27 }, { "step": 1190, "epoch": 0.6435476238761576, "wallclock": "2026-06-24T11:22:21.402749", "loss": 0.0556, "grad_norm": 0.6524196863174438, "learning_rate": 7.928642684494696e-06, "step_time_sec": 105.52 }, { "step": 1195, "epoch": 0.6462516054890827, "wallclock": "2026-06-24T11:24:07.831963", "loss": 0.066, "grad_norm": 0.7481945157051086, "learning_rate": 7.910868049007312e-06, "step_time_sec": 106.43 }, { "step": 1200, "epoch": 0.6489555871020077, "wallclock": "2026-06-24T11:25:53.724397", "loss": 0.0712, "grad_norm": 0.9145833849906921, "learning_rate": 7.893037592363959e-06, "step_time_sec": 105.89, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1200, "epoch": 0.6489555871020077, "wallclock": "2026-06-24T11:27:34.112458", "eval_loss": 0.0755784884095192, "eval_runtime": 100.3831, "eval_samples_per_second": 4.981, "eval_steps_per_second": 1.245, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1205, "epoch": 0.6516595687149327, "wallclock": "2026-06-24T11:30:41.010276", "loss": 0.0515, "grad_norm": 0.7790878415107727, "learning_rate": 7.875151656495874e-06, "step_time_sec": 287.29 }, { "step": 1210, "epoch": 0.6543635503278578, "wallclock": "2026-06-24T11:32:27.481096", "loss": 0.0566, "grad_norm": 0.6310415267944336, "learning_rate": 7.8572105843982e-06, "step_time_sec": 106.47 }, { "step": 1215, "epoch": 0.6570675319407828, "wallclock": "2026-06-24T11:34:13.893600", "loss": 0.0451, "grad_norm": 0.5569303631782532, "learning_rate": 7.839214720123427e-06, "step_time_sec": 106.41 }, { "step": 1220, "epoch": 0.6597715135537079, "wallclock": "2026-06-24T11:35:58.515537", "loss": 0.0688, "grad_norm": 0.636441171169281, "learning_rate": 7.821164408774772e-06, "step_time_sec": 104.62 }, { "step": 1225, "epoch": 0.6624754951666328, "wallclock": "2026-06-24T11:37:42.397948", "loss": 0.0697, "grad_norm": 0.7517639398574829, "learning_rate": 7.803059996499584e-06, "step_time_sec": 103.88 }, { "step": 1230, "epoch": 0.6651794767795579, "wallclock": "2026-06-24T11:39:28.808346", "loss": 0.0575, "grad_norm": 0.5596706867218018, "learning_rate": 7.78490183048269e-06, "step_time_sec": 106.41 }, { "step": 1235, "epoch": 0.667883458392483, "wallclock": "2026-06-24T11:41:14.200078", "loss": 0.0586, "grad_norm": 0.645969033241272, "learning_rate": 7.76669025893974e-06, "step_time_sec": 105.39 }, { "step": 1240, "epoch": 0.6705874400054079, "wallclock": "2026-06-24T11:43:00.685648", "loss": 0.0658, "grad_norm": 0.7119715213775635, "learning_rate": 7.748425631110536e-06, "step_time_sec": 106.49 }, { "step": 1245, "epoch": 0.673291421618333, "wallclock": "2026-06-24T11:44:47.803121", "loss": 0.07, "grad_norm": 1.2201249599456787, "learning_rate": 7.730108297252328e-06, "step_time_sec": 107.12 }, { "step": 1250, "epoch": 0.6759954032312581, "wallclock": "2026-06-24T11:46:32.404383", "loss": 0.0473, "grad_norm": 0.7548292875289917, "learning_rate": 7.7117386086331e-06, "step_time_sec": 104.6, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1255, "epoch": 0.678699384844183, "wallclock": "2026-06-24T11:48:18.393499", "loss": 0.0855, "grad_norm": 1.1629971265792847, "learning_rate": 7.693316917524832e-06, "step_time_sec": 105.99 }, { "step": 1260, "epoch": 0.6814033664571081, "wallclock": "2026-06-24T11:50:04.895492", "loss": 0.0511, "grad_norm": 0.798232913017273, "learning_rate": 7.674843577196747e-06, "step_time_sec": 106.5 }, { "step": 1265, "epoch": 0.6841073480700331, "wallclock": "2026-06-24T11:51:50.622300", "loss": 0.055, "grad_norm": 0.5960519909858704, "learning_rate": 7.656318941908534e-06, "step_time_sec": 105.73 }, { "step": 1270, "epoch": 0.6868113296829581, "wallclock": "2026-06-24T11:53:37.318998", "loss": 0.069, "grad_norm": 0.8142486810684204, "learning_rate": 7.637743366903559e-06, "step_time_sec": 106.7 }, { "step": 1275, "epoch": 0.6895153112958832, "wallclock": "2026-06-24T11:55:24.097132", "loss": 0.0486, "grad_norm": 0.6205362677574158, "learning_rate": 7.61911720840204e-06, "step_time_sec": 106.78 }, { "step": 1280, "epoch": 0.6922192929088082, "wallclock": "2026-06-24T11:57:09.718166", "loss": 0.0653, "grad_norm": 1.1235874891281128, "learning_rate": 7.60044082359424e-06, "step_time_sec": 105.62 }, { "step": 1285, "epoch": 0.6949232745217333, "wallclock": "2026-06-24T11:58:56.721018", "loss": 0.078, "grad_norm": 0.8355940580368042, "learning_rate": 7.581714570633586e-06, "step_time_sec": 107.0 }, { "step": 1290, "epoch": 0.6976272561346583, "wallclock": "2026-06-24T12:00:43.212505", "loss": 0.0592, "grad_norm": 2.8660950660705566, "learning_rate": 7.562938808629829e-06, "step_time_sec": 106.49 }, { "step": 1295, "epoch": 0.7003312377475833, "wallclock": "2026-06-24T12:02:28.492093", "loss": 0.058, "grad_norm": 0.8744626045227051, "learning_rate": 7.54411389764214e-06, "step_time_sec": 105.28 }, { "step": 1300, "epoch": 0.7030352193605084, "wallclock": "2026-06-24T12:04:14.101813", "loss": 0.0608, "grad_norm": 0.6016539931297302, "learning_rate": 7.52524019867221e-06, "step_time_sec": 105.61, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1300, "epoch": 0.7030352193605084, "wallclock": "2026-06-24T12:05:54.677349", "eval_loss": 0.07137385755777359, "eval_runtime": 100.5705, "eval_samples_per_second": 4.972, "eval_steps_per_second": 1.243, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1305, "epoch": 0.7057392009734333, "wallclock": "2026-06-24T12:08:54.483469", "loss": 0.0709, "grad_norm": 0.6734678149223328, "learning_rate": 7.506318073657331e-06, "step_time_sec": 280.38 }, { "step": 1310, "epoch": 0.7084431825863584, "wallclock": "2026-06-24T12:10:38.696936", "loss": 0.0623, "grad_norm": 0.7641857266426086, "learning_rate": 7.4873478854634476e-06, "step_time_sec": 104.21 }, { "step": 1315, "epoch": 0.7111471641992835, "wallclock": "2026-06-24T12:12:32.495225", "loss": 0.0587, "grad_norm": 0.8493006229400635, "learning_rate": 7.4683299978782076e-06, "step_time_sec": 113.8 }, { "step": 1320, "epoch": 0.7138511458122084, "wallclock": "2026-06-24T12:14:17.906835", "loss": 0.0474, "grad_norm": 0.4841386377811432, "learning_rate": 7.449264775603979e-06, "step_time_sec": 105.41 }, { "step": 1325, "epoch": 0.7165551274251335, "wallclock": "2026-06-24T12:16:05.319711", "loss": 0.0603, "grad_norm": 0.872616171836853, "learning_rate": 7.430152584250856e-06, "step_time_sec": 107.41 }, { "step": 1330, "epoch": 0.7192591090380586, "wallclock": "2026-06-24T12:17:51.521993", "loss": 0.0525, "grad_norm": 0.7304244041442871, "learning_rate": 7.410993790329652e-06, "step_time_sec": 106.2 }, { "step": 1335, "epoch": 0.7219630906509835, "wallclock": "2026-06-24T12:19:38.816730", "loss": 0.0441, "grad_norm": 0.5004603266716003, "learning_rate": 7.3917887612448665e-06, "step_time_sec": 107.29 }, { "step": 1340, "epoch": 0.7246670722639086, "wallclock": "2026-06-24T12:21:24.716857", "loss": 0.0669, "grad_norm": 0.6454601287841797, "learning_rate": 7.372537865287648e-06, "step_time_sec": 105.9 }, { "step": 1345, "epoch": 0.7273710538768337, "wallclock": "2026-06-24T12:23:10.411567", "loss": 0.0422, "grad_norm": 0.9636154174804688, "learning_rate": 7.353241471628716e-06, "step_time_sec": 105.69 }, { "step": 1350, "epoch": 0.7300750354897587, "wallclock": "2026-06-24T12:24:56.017994", "loss": 0.0456, "grad_norm": 0.6495915651321411, "learning_rate": 7.3338999503112975e-06, "step_time_sec": 105.61, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1355, "epoch": 0.7327790171026837, "wallclock": "2026-06-24T12:26:41.498254", "loss": 0.0547, "grad_norm": 0.8502314686775208, "learning_rate": 7.314513672244021e-06, "step_time_sec": 105.48 }, { "step": 1360, "epoch": 0.7354829987156087, "wallclock": "2026-06-24T12:28:28.488384", "loss": 0.0607, "grad_norm": 0.5915205478668213, "learning_rate": 7.295083009193808e-06, "step_time_sec": 106.99 }, { "step": 1365, "epoch": 0.7381869803285338, "wallclock": "2026-06-24T12:30:17.161947", "loss": 0.0654, "grad_norm": 0.7883327603340149, "learning_rate": 7.275608333778742e-06, "step_time_sec": 108.67 }, { "step": 1370, "epoch": 0.7408909619414588, "wallclock": "2026-06-24T12:32:05.107393", "loss": 0.0552, "grad_norm": 0.7381963133811951, "learning_rate": 7.256090019460922e-06, "step_time_sec": 107.95 }, { "step": 1375, "epoch": 0.7435949435543838, "wallclock": "2026-06-24T12:33:52.735316", "loss": 0.0649, "grad_norm": 0.8336455821990967, "learning_rate": 7.236528440539303e-06, "step_time_sec": 107.63 }, { "step": 1380, "epoch": 0.7462989251673089, "wallclock": "2026-06-24T12:35:40.395535", "loss": 0.0393, "grad_norm": 0.5212644338607788, "learning_rate": 7.2169239721425154e-06, "step_time_sec": 107.66 }, { "step": 1385, "epoch": 0.7490029067802338, "wallclock": "2026-06-24T12:37:27.680116", "loss": 0.0633, "grad_norm": 0.8368508815765381, "learning_rate": 7.197276990221677e-06, "step_time_sec": 107.28 }, { "step": 1390, "epoch": 0.7517068883931589, "wallclock": "2026-06-24T12:39:14.615045", "loss": 0.0497, "grad_norm": 0.7919797897338867, "learning_rate": 7.177587871543172e-06, "step_time_sec": 106.93 }, { "step": 1395, "epoch": 0.754410870006084, "wallclock": "2026-06-24T12:41:03.202011", "loss": 0.0564, "grad_norm": 0.8120989799499512, "learning_rate": 7.157856993681442e-06, "step_time_sec": 108.59 }, { "step": 1400, "epoch": 0.757114851619009, "wallclock": "2026-06-24T12:42:50.343742", "loss": 0.0647, "grad_norm": 0.6419529318809509, "learning_rate": 7.138084735011727e-06, "step_time_sec": 107.14, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1400, "epoch": 0.757114851619009, "wallclock": "2026-06-24T12:44:31.432549", "eval_loss": 0.07097452133893967, "eval_runtime": 101.0814, "eval_samples_per_second": 4.947, "eval_steps_per_second": 1.237, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1405, "epoch": 0.759818833231934, "wallclock": "2026-06-24T12:47:46.178602", "loss": 0.0585, "grad_norm": 0.5049331188201904, "learning_rate": 7.118271474702828e-06, "step_time_sec": 295.83 }, { "step": 1410, "epoch": 0.7625228148448591, "wallclock": "2026-06-24T12:49:34.393720", "loss": 0.0597, "grad_norm": 0.8283151984214783, "learning_rate": 7.098417592709819e-06, "step_time_sec": 108.22 }, { "step": 1415, "epoch": 0.7652267964577841, "wallclock": "2026-06-24T12:51:22.319403", "loss": 0.052, "grad_norm": 0.6273178458213806, "learning_rate": 7.078523469766772e-06, "step_time_sec": 107.93 }, { "step": 1420, "epoch": 0.7679307780707091, "wallclock": "2026-06-24T12:53:09.791827", "loss": 0.0504, "grad_norm": 0.6756861805915833, "learning_rate": 7.0585894873794514e-06, "step_time_sec": 107.47 }, { "step": 1425, "epoch": 0.7706347596836342, "wallclock": "2026-06-24T12:54:57.620245", "loss": 0.0341, "grad_norm": 0.5247818231582642, "learning_rate": 7.038616027817998e-06, "step_time_sec": 107.83 }, { "step": 1430, "epoch": 0.7733387412965592, "wallclock": "2026-06-24T12:56:43.618402", "loss": 0.0617, "grad_norm": 0.5578892230987549, "learning_rate": 7.018603474109601e-06, "step_time_sec": 106.0 }, { "step": 1435, "epoch": 0.7760427229094842, "wallclock": "2026-06-24T12:58:29.201049", "loss": 0.0443, "grad_norm": 0.8692203760147095, "learning_rate": 6.9985522100311465e-06, "step_time_sec": 105.58 }, { "step": 1440, "epoch": 0.7787467045224092, "wallclock": "2026-06-24T13:00:16.509392", "loss": 0.0692, "grad_norm": 0.6393124461174011, "learning_rate": 6.978462620101865e-06, "step_time_sec": 107.31 }, { "step": 1445, "epoch": 0.7814506861353343, "wallclock": "2026-06-24T13:02:04.488002", "loss": 0.0524, "grad_norm": 0.66062992811203, "learning_rate": 6.958335089575952e-06, "step_time_sec": 107.98 }, { "step": 1450, "epoch": 0.7841546677482593, "wallclock": "2026-06-24T13:03:51.464399", "loss": 0.0606, "grad_norm": 0.3925676643848419, "learning_rate": 6.938170004435186e-06, "step_time_sec": 106.98, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1455, "epoch": 0.7868586493611843, "wallclock": "2026-06-24T13:05:39.613199", "loss": 0.0539, "grad_norm": 0.7175688147544861, "learning_rate": 6.91796775138152e-06, "step_time_sec": 108.15 }, { "step": 1460, "epoch": 0.7895626309741094, "wallclock": "2026-06-24T13:07:26.686179", "loss": 0.0571, "grad_norm": 0.6691136360168457, "learning_rate": 6.89772871782967e-06, "step_time_sec": 107.07 }, { "step": 1465, "epoch": 0.7922666125870345, "wallclock": "2026-06-24T13:09:14.110537", "loss": 0.0749, "grad_norm": 0.963224470615387, "learning_rate": 6.877453291899685e-06, "step_time_sec": 107.42 }, { "step": 1470, "epoch": 0.7949705941999594, "wallclock": "2026-06-24T13:11:01.097029", "loss": 0.0665, "grad_norm": 0.702336311340332, "learning_rate": 6.857141862409504e-06, "step_time_sec": 106.99 }, { "step": 1475, "epoch": 0.7976745758128845, "wallclock": "2026-06-24T13:12:48.608856", "loss": 0.0502, "grad_norm": 0.5416118502616882, "learning_rate": 6.836794818867496e-06, "step_time_sec": 107.51 }, { "step": 1480, "epoch": 0.8003785574258095, "wallclock": "2026-06-24T13:14:36.033474", "loss": 0.0441, "grad_norm": 0.5691907405853271, "learning_rate": 6.816412551464999e-06, "step_time_sec": 107.42 }, { "step": 1485, "epoch": 0.8030825390387345, "wallclock": "2026-06-24T13:16:24.108145", "loss": 0.0626, "grad_norm": 0.6911583542823792, "learning_rate": 6.795995451068828e-06, "step_time_sec": 108.07 }, { "step": 1490, "epoch": 0.8057865206516596, "wallclock": "2026-06-24T13:18:10.811010", "loss": 0.0563, "grad_norm": 1.3713301420211792, "learning_rate": 6.775543909213786e-06, "step_time_sec": 106.7 }, { "step": 1495, "epoch": 0.8084905022645846, "wallclock": "2026-06-24T13:19:58.721530", "loss": 0.0483, "grad_norm": 0.7632337212562561, "learning_rate": 6.755058318095151e-06, "step_time_sec": 107.91 }, { "step": 1500, "epoch": 0.8111944838775096, "wallclock": "2026-06-24T13:21:45.388432", "loss": 0.0697, "grad_norm": 1.1038848161697388, "learning_rate": 6.73453907056116e-06, "step_time_sec": 106.67, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1500, "epoch": 0.8111944838775096, "wallclock": "2026-06-24T13:23:26.179575", "eval_loss": 0.0741763636469841, "eval_runtime": 100.7848, "eval_samples_per_second": 4.961, "eval_steps_per_second": 1.24, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1505, "epoch": 0.8138984654904347, "wallclock": "2026-06-24T13:26:37.503409", "loss": 0.0618, "grad_norm": 0.8092767000198364, "learning_rate": 6.71398656010547e-06, "step_time_sec": 292.11 }, { "step": 1510, "epoch": 0.8166024471033597, "wallclock": "2026-06-24T13:28:25.707697", "loss": 0.0432, "grad_norm": 0.6367549300193787, "learning_rate": 6.693401180859618e-06, "step_time_sec": 108.2 }, { "step": 1515, "epoch": 0.8193064287162848, "wallclock": "2026-06-24T13:30:13.512145", "loss": 0.0434, "grad_norm": 0.7922583222389221, "learning_rate": 6.672783327585454e-06, "step_time_sec": 107.8 }, { "step": 1520, "epoch": 0.8220104103292097, "wallclock": "2026-06-24T13:32:01.383536", "loss": 0.061, "grad_norm": 0.7766749858856201, "learning_rate": 6.65213339566758e-06, "step_time_sec": 107.87 }, { "step": 1525, "epoch": 0.8247143919421348, "wallclock": "2026-06-24T13:33:48.783229", "loss": 0.0369, "grad_norm": 0.5121834874153137, "learning_rate": 6.631451781105767e-06, "step_time_sec": 107.4 }, { "step": 1530, "epoch": 0.8274183735550599, "wallclock": "2026-06-24T13:35:36.299372", "loss": 0.0582, "grad_norm": 0.726270318031311, "learning_rate": 6.6107388805073495e-06, "step_time_sec": 107.52 }, { "step": 1535, "epoch": 0.8301223551679848, "wallclock": "2026-06-24T13:37:23.706841", "loss": 0.0468, "grad_norm": 0.6746184825897217, "learning_rate": 6.589995091079636e-06, "step_time_sec": 107.41 }, { "step": 1540, "epoch": 0.8328263367809099, "wallclock": "2026-06-24T13:39:10.512639", "loss": 0.0595, "grad_norm": 0.8106797337532043, "learning_rate": 6.569220810622281e-06, "step_time_sec": 106.81 }, { "step": 1545, "epoch": 0.835530318393835, "wallclock": "2026-06-24T13:40:56.792388", "loss": 0.0539, "grad_norm": 0.7323052287101746, "learning_rate": 6.548416437519658e-06, "step_time_sec": 106.28 }, { "step": 1550, "epoch": 0.8382343000067599, "wallclock": "2026-06-24T13:42:44.227103", "loss": 0.0491, "grad_norm": 0.6671241521835327, "learning_rate": 6.5275823707332275e-06, "step_time_sec": 107.43, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1555, "epoch": 0.840938281619685, "wallclock": "2026-06-24T13:44:32.006313", "loss": 0.0463, "grad_norm": 1.1120103597640991, "learning_rate": 6.50671900979387e-06, "step_time_sec": 107.78 }, { "step": 1560, "epoch": 0.84364226323261, "wallclock": "2026-06-24T13:46:19.183463", "loss": 0.0542, "grad_norm": 0.3518182039260864, "learning_rate": 6.485826754794245e-06, "step_time_sec": 107.18 }, { "step": 1565, "epoch": 0.846346244845535, "wallclock": "2026-06-24T13:48:06.385481", "loss": 0.0379, "grad_norm": 0.7423526644706726, "learning_rate": 6.464906006381101e-06, "step_time_sec": 107.2 }, { "step": 1570, "epoch": 0.8490502264584601, "wallclock": "2026-06-24T13:49:54.323330", "loss": 0.0477, "grad_norm": 0.7195249795913696, "learning_rate": 6.443957165747601e-06, "step_time_sec": 107.94 }, { "step": 1575, "epoch": 0.8517542080713851, "wallclock": "2026-06-24T13:51:40.924711", "loss": 0.058, "grad_norm": 1.0430902242660522, "learning_rate": 6.422980634625627e-06, "step_time_sec": 106.6 }, { "step": 1580, "epoch": 0.8544581896843102, "wallclock": "2026-06-24T13:53:26.389981", "loss": 0.054, "grad_norm": 0.8965272903442383, "learning_rate": 6.4019768152780785e-06, "step_time_sec": 105.47 }, { "step": 1585, "epoch": 0.8571621712972352, "wallclock": "2026-06-24T13:55:13.357681", "loss": 0.0538, "grad_norm": 0.9105026125907898, "learning_rate": 6.380946110491151e-06, "step_time_sec": 106.97 }, { "step": 1590, "epoch": 0.8598661529101602, "wallclock": "2026-06-24T13:57:00.092764", "loss": 0.0405, "grad_norm": 0.7773502469062805, "learning_rate": 6.359888923566621e-06, "step_time_sec": 106.74 }, { "step": 1595, "epoch": 0.8625701345230853, "wallclock": "2026-06-24T13:58:47.207895", "loss": 0.0522, "grad_norm": 1.0928678512573242, "learning_rate": 6.338805658314106e-06, "step_time_sec": 107.12 }, { "step": 1600, "epoch": 0.8652741161360102, "wallclock": "2026-06-24T14:00:34.321798", "loss": 0.0346, "grad_norm": 0.37700727581977844, "learning_rate": 6.317696719043327e-06, "step_time_sec": 107.11, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1600, "epoch": 0.8652741161360102, "wallclock": "2026-06-24T14:02:14.979680", "eval_loss": 0.07683192193508148, "eval_runtime": 100.6515, "eval_samples_per_second": 4.968, "eval_steps_per_second": 1.242, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1605, "epoch": 0.8679780977489353, "wallclock": "2026-06-24T14:05:26.609530", "loss": 0.0504, "grad_norm": 0.8107773065567017, "learning_rate": 6.2965625105563445e-06, "step_time_sec": 292.29 }, { "step": 1610, "epoch": 0.8706820793618604, "wallclock": "2026-06-24T14:07:13.558796", "loss": 0.0366, "grad_norm": 1.109079122543335, "learning_rate": 6.275403438139801e-06, "step_time_sec": 106.95 }, { "step": 1615, "epoch": 0.8733860609747853, "wallclock": "2026-06-24T14:09:00.902964", "loss": 0.0516, "grad_norm": 0.990442156791687, "learning_rate": 6.254219907557159e-06, "step_time_sec": 107.34 }, { "step": 1620, "epoch": 0.8760900425877104, "wallclock": "2026-06-24T14:10:48.617807", "loss": 0.0418, "grad_norm": 0.7781974077224731, "learning_rate": 6.2330123250409e-06, "step_time_sec": 107.71 }, { "step": 1625, "epoch": 0.8787940242006355, "wallclock": "2026-06-24T14:12:35.112434", "loss": 0.0574, "grad_norm": 1.2163763046264648, "learning_rate": 6.211781097284754e-06, "step_time_sec": 106.49 }, { "step": 1630, "epoch": 0.8814980058135604, "wallclock": "2026-06-24T14:14:21.209253", "loss": 0.0626, "grad_norm": 0.9669123291969299, "learning_rate": 6.190526631435882e-06, "step_time_sec": 106.1 }, { "step": 1635, "epoch": 0.8842019874264855, "wallclock": "2026-06-24T14:16:08.902128", "loss": 0.04, "grad_norm": 1.140141487121582, "learning_rate": 6.169249335087085e-06, "step_time_sec": 107.69 }, { "step": 1640, "epoch": 0.8869059690394105, "wallclock": "2026-06-24T14:17:56.627117", "loss": 0.0813, "grad_norm": 1.00438392162323, "learning_rate": 6.1479496162689775e-06, "step_time_sec": 107.72 }, { "step": 1645, "epoch": 0.8896099506523356, "wallclock": "2026-06-24T14:19:43.930288", "loss": 0.051, "grad_norm": 1.1830681562423706, "learning_rate": 6.1266278834421634e-06, "step_time_sec": 107.3 }, { "step": 1650, "epoch": 0.8923139322652606, "wallclock": "2026-06-24T14:21:30.620817", "loss": 0.048, "grad_norm": 0.7539001107215881, "learning_rate": 6.105284545489408e-06, "step_time_sec": 106.69, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1655, "epoch": 0.8950179138781856, "wallclock": "2026-06-24T14:23:17.090603", "loss": 0.044, "grad_norm": 0.8819478750228882, "learning_rate": 6.083920011707794e-06, "step_time_sec": 106.47 }, { "step": 1660, "epoch": 0.8977218954911107, "wallclock": "2026-06-24T14:25:03.986530", "loss": 0.0369, "grad_norm": 0.6605198383331299, "learning_rate": 6.062534691800865e-06, "step_time_sec": 106.9 }, { "step": 1665, "epoch": 0.9004258771040357, "wallclock": "2026-06-24T14:26:50.818521", "loss": 0.0434, "grad_norm": 0.5586560368537903, "learning_rate": 6.04112899587079e-06, "step_time_sec": 106.83 }, { "step": 1670, "epoch": 0.9031298587169607, "wallclock": "2026-06-24T14:28:38.292626", "loss": 0.0537, "grad_norm": 0.6612546443939209, "learning_rate": 6.019703334410473e-06, "step_time_sec": 107.47 }, { "step": 1675, "epoch": 0.9058338403298858, "wallclock": "2026-06-24T14:30:25.839650", "loss": 0.045, "grad_norm": 0.8835639357566833, "learning_rate": 5.998258118295699e-06, "step_time_sec": 107.55 }, { "step": 1680, "epoch": 0.9085378219428107, "wallclock": "2026-06-24T14:32:15.497667", "loss": 0.0351, "grad_norm": 0.7877563834190369, "learning_rate": 5.9767937587772464e-06, "step_time_sec": 109.66 }, { "step": 1685, "epoch": 0.9112418035557358, "wallclock": "2026-06-24T14:34:02.801842", "loss": 0.0423, "grad_norm": 0.8421223759651184, "learning_rate": 5.955310667473003e-06, "step_time_sec": 107.3 }, { "step": 1690, "epoch": 0.9139457851686609, "wallclock": "2026-06-24T14:35:49.804897", "loss": 0.0549, "grad_norm": 0.9553209543228149, "learning_rate": 5.933809256360076e-06, "step_time_sec": 107.0 }, { "step": 1695, "epoch": 0.9166497667815859, "wallclock": "2026-06-24T14:37:39.298110", "loss": 0.0365, "grad_norm": 0.9886178374290466, "learning_rate": 5.912289937766882e-06, "step_time_sec": 109.49 }, { "step": 1700, "epoch": 0.9193537483945109, "wallclock": "2026-06-24T14:39:25.987650", "loss": 0.0488, "grad_norm": 0.7625762820243835, "learning_rate": 5.890753124365252e-06, "step_time_sec": 106.69, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1700, "epoch": 0.9193537483945109, "wallclock": "2026-06-24T14:41:06.647821", "eval_loss": 0.0766952782869339, "eval_runtime": 100.6532, "eval_samples_per_second": 4.968, "eval_steps_per_second": 1.242, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1700, "epoch": 0.9193537483945109, "wallclock": "2026-06-24T14:42:31.431895", "train_runtime": 39166.0066, "train_samples_per_second": 3.021, "train_steps_per_second": 0.094, "total_flos": 5114610608766976.0, "train_loss": 0.08617227443877389, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1700, "epoch": 0.9193537483945109, "wallclock": "2026-06-24T14:44:29.019553", "eval_loss": 0.07097452133893967, "eval_runtime": 100.1105, "eval_samples_per_second": 4.994, "eval_steps_per_second": 1.249, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 80.99 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] } ]