NYXMed-V18-Model / training_metrics.json
vineetdaniels's picture
Model save
e87372d verified
Raw
History Blame Contribute Delete
107 kB
[
{
"step": 1,
"epoch": 0.0005407963225850064,
"wallclock": "2026-06-24T03:50:12.865305",
"loss": 0.4579,
"grad_norm": 2.3048150539398193,
"learning_rate": 9.00900900900901e-08
},
{
"step": 5,
"epoch": 0.002703981612925032,
"wallclock": "2026-06-24T03:51:38.176434",
"loss": 0.3614,
"grad_norm": 1.8860491514205933,
"learning_rate": 4.504504504504505e-07,
"step_time_sec": 85.31
},
{
"step": 10,
"epoch": 0.005407963225850064,
"wallclock": "2026-06-24T03:53:23.303779",
"loss": 0.4862,
"grad_norm": 2.008655548095703,
"learning_rate": 9.00900900900901e-07,
"step_time_sec": 105.13
},
{
"step": 15,
"epoch": 0.008111944838775096,
"wallclock": "2026-06-24T03:55:09.377081",
"loss": 0.4649,
"grad_norm": 2.2208123207092285,
"learning_rate": 1.3513513513513515e-06,
"step_time_sec": 106.07
},
{
"step": 20,
"epoch": 0.010815926451700129,
"wallclock": "2026-06-24T03:56:56.093355",
"loss": 0.4136,
"grad_norm": 1.8327311277389526,
"learning_rate": 1.801801801801802e-06,
"step_time_sec": 106.72
},
{
"step": 25,
"epoch": 0.01351990806462516,
"wallclock": "2026-06-24T03:58:40.595516",
"loss": 0.4941,
"grad_norm": 2.889826536178589,
"learning_rate": 2.2522522522522524e-06,
"step_time_sec": 104.5
},
{
"step": 30,
"epoch": 0.01622388967755019,
"wallclock": "2026-06-24T04:00:26.401533",
"loss": 0.3599,
"grad_norm": 2.0262131690979004,
"learning_rate": 2.702702702702703e-06,
"step_time_sec": 105.81
},
{
"step": 35,
"epoch": 0.018927871290475226,
"wallclock": "2026-06-24T04:02:12.105552",
"loss": 0.2975,
"grad_norm": 1.627108097076416,
"learning_rate": 3.1531531531531532e-06,
"step_time_sec": 105.7
},
{
"step": 40,
"epoch": 0.021631852903400257,
"wallclock": "2026-06-24T04:03:56.186851",
"loss": 0.3563,
"grad_norm": 1.6855164766311646,
"learning_rate": 3.603603603603604e-06,
"step_time_sec": 104.08
},
{
"step": 45,
"epoch": 0.02433583451632529,
"wallclock": "2026-06-24T04:05:41.990511",
"loss": 0.3596,
"grad_norm": 1.661110520362854,
"learning_rate": 4.0540540540540545e-06,
"step_time_sec": 105.8
},
{
"step": 50,
"epoch": 0.02703981612925032,
"wallclock": "2026-06-24T04:07:27.997794",
"loss": 0.267,
"grad_norm": 1.2917487621307373,
"learning_rate": 4.504504504504505e-06,
"step_time_sec": 106.01,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 75.27
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 55,
"epoch": 0.029743797742175355,
"wallclock": "2026-06-24T04:09:12.604118",
"loss": 0.2226,
"grad_norm": 0.9745638370513916,
"learning_rate": 4.954954954954955e-06,
"step_time_sec": 104.61
},
{
"step": 60,
"epoch": 0.03244777935510038,
"wallclock": "2026-06-24T04:10:57.892162",
"loss": 0.1948,
"grad_norm": 1.2490293979644775,
"learning_rate": 5.405405405405406e-06,
"step_time_sec": 105.29
},
{
"step": 65,
"epoch": 0.03515176096802542,
"wallclock": "2026-06-24T04:12:44.688074",
"loss": 0.2015,
"grad_norm": 0.9993012547492981,
"learning_rate": 5.855855855855856e-06,
"step_time_sec": 106.8
},
{
"step": 70,
"epoch": 0.03785574258095045,
"wallclock": "2026-06-24T04:14:30.291858",
"loss": 0.2271,
"grad_norm": 1.3109948635101318,
"learning_rate": 6.3063063063063065e-06,
"step_time_sec": 105.6
},
{
"step": 75,
"epoch": 0.040559724193875484,
"wallclock": "2026-06-24T04:16:14.219008",
"loss": 0.157,
"grad_norm": 0.6500820517539978,
"learning_rate": 6.7567567567567575e-06,
"step_time_sec": 103.93
},
{
"step": 80,
"epoch": 0.043263705806800515,
"wallclock": "2026-06-24T04:17:59.413498",
"loss": 0.1579,
"grad_norm": 0.8443478345870972,
"learning_rate": 7.207207207207208e-06,
"step_time_sec": 105.19
},
{
"step": 85,
"epoch": 0.045967687419725546,
"wallclock": "2026-06-24T04:19:44.517567",
"loss": 0.1641,
"grad_norm": 0.9615593552589417,
"learning_rate": 7.657657657657658e-06,
"step_time_sec": 105.1
},
{
"step": 90,
"epoch": 0.04867166903265058,
"wallclock": "2026-06-24T04:21:28.592924",
"loss": 0.1288,
"grad_norm": 0.6482295989990234,
"learning_rate": 8.108108108108109e-06,
"step_time_sec": 104.08
},
{
"step": 95,
"epoch": 0.05137565064557561,
"wallclock": "2026-06-24T04:23:14.290954",
"loss": 0.136,
"grad_norm": 0.8641292452812195,
"learning_rate": 8.55855855855856e-06,
"step_time_sec": 105.7
},
{
"step": 100,
"epoch": 0.05407963225850064,
"wallclock": "2026-06-24T04:24:58.500503",
"loss": 0.1148,
"grad_norm": 0.7579247355461121,
"learning_rate": 9.00900900900901e-06,
"step_time_sec": 104.21,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 77.06
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 100,
"epoch": 0.05407963225850064,
"wallclock": "2026-06-24T04:26:38.179343",
"eval_loss": 0.09886857122182846,
"eval_runtime": 99.676,
"eval_samples_per_second": 5.016,
"eval_steps_per_second": 1.254,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 77.07
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 105,
"epoch": 0.05678361387142567,
"wallclock": "2026-06-24T04:29:47.276194",
"loss": 0.1357,
"grad_norm": 1.0714149475097656,
"learning_rate": 9.45945945945946e-06,
"step_time_sec": 288.78
},
{
"step": 110,
"epoch": 0.05948759548435071,
"wallclock": "2026-06-24T04:31:33.882914",
"loss": 0.1446,
"grad_norm": 0.9079675078392029,
"learning_rate": 9.90990990990991e-06,
"step_time_sec": 106.61
},
{
"step": 115,
"epoch": 0.06219157709727574,
"wallclock": "2026-06-24T04:33:20.601103",
"loss": 0.1169,
"grad_norm": 0.7082911729812622,
"learning_rate": 9.999969317090495e-06,
"step_time_sec": 106.72
},
{
"step": 120,
"epoch": 0.06489555871020077,
"wallclock": "2026-06-24T04:35:06.071476",
"loss": 0.1294,
"grad_norm": 0.8290165066719055,
"learning_rate": 9.99984466841603e-06,
"step_time_sec": 105.47
},
{
"step": 125,
"epoch": 0.0675995403231258,
"wallclock": "2026-06-24T04:36:51.012116",
"loss": 0.0967,
"grad_norm": 3.919275999069214,
"learning_rate": 9.999624138683289e-06,
"step_time_sec": 104.94
},
{
"step": 130,
"epoch": 0.07030352193605084,
"wallclock": "2026-06-24T04:38:36.077195",
"loss": 0.1226,
"grad_norm": 0.9278262853622437,
"learning_rate": 9.999307732121325e-06,
"step_time_sec": 105.07
},
{
"step": 135,
"epoch": 0.07300750354897587,
"wallclock": "2026-06-24T04:40:20.812264",
"loss": 0.1098,
"grad_norm": 0.7507790923118591,
"learning_rate": 9.998895454797807e-06,
"step_time_sec": 104.74
},
{
"step": 140,
"epoch": 0.0757114851619009,
"wallclock": "2026-06-24T04:42:06.675192",
"loss": 0.1013,
"grad_norm": 0.42517712712287903,
"learning_rate": 9.998387314618898e-06,
"step_time_sec": 105.86
},
{
"step": 145,
"epoch": 0.07841546677482593,
"wallclock": "2026-06-24T04:43:52.811351",
"loss": 0.1198,
"grad_norm": 0.9795618653297424,
"learning_rate": 9.997783321329104e-06,
"step_time_sec": 106.14
},
{
"step": 150,
"epoch": 0.08111944838775097,
"wallclock": "2026-06-24T04:45:37.500620",
"loss": 0.1097,
"grad_norm": 0.8131667375564575,
"learning_rate": 9.997083486511088e-06,
"step_time_sec": 104.69,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 155,
"epoch": 0.08382343000067599,
"wallclock": "2026-06-24T04:47:23.099655",
"loss": 0.1064,
"grad_norm": 1.0449674129486084,
"learning_rate": 9.996287823585446e-06,
"step_time_sec": 105.6
},
{
"step": 160,
"epoch": 0.08652741161360103,
"wallclock": "2026-06-24T04:49:08.308222",
"loss": 0.1413,
"grad_norm": 1.2222431898117065,
"learning_rate": 9.995396347810456e-06,
"step_time_sec": 105.21
},
{
"step": 165,
"epoch": 0.08923139322652605,
"wallclock": "2026-06-24T04:50:53.614369",
"loss": 0.095,
"grad_norm": 0.6692535877227783,
"learning_rate": 9.994409076281776e-06,
"step_time_sec": 105.31
},
{
"step": 170,
"epoch": 0.09193537483945109,
"wallclock": "2026-06-24T04:52:39.312428",
"loss": 0.0987,
"grad_norm": 0.7257323861122131,
"learning_rate": 9.99332602793212e-06,
"step_time_sec": 105.7
},
{
"step": 175,
"epoch": 0.09463935645237613,
"wallclock": "2026-06-24T04:54:25.104575",
"loss": 0.1172,
"grad_norm": 0.694538414478302,
"learning_rate": 9.992147223530901e-06,
"step_time_sec": 105.79
},
{
"step": 180,
"epoch": 0.09734333806530115,
"wallclock": "2026-06-24T04:56:09.832372",
"loss": 0.0912,
"grad_norm": 0.5451284646987915,
"learning_rate": 9.99087268568382e-06,
"step_time_sec": 104.73
},
{
"step": 185,
"epoch": 0.1000473196782262,
"wallclock": "2026-06-24T04:57:55.711549",
"loss": 0.1088,
"grad_norm": 0.7407487034797668,
"learning_rate": 9.989502438832447e-06,
"step_time_sec": 105.88
},
{
"step": 190,
"epoch": 0.10275130129115122,
"wallclock": "2026-06-24T04:59:41.200392",
"loss": 0.1331,
"grad_norm": 0.6032689809799194,
"learning_rate": 9.988036509253742e-06,
"step_time_sec": 105.49
},
{
"step": 195,
"epoch": 0.10545528290407626,
"wallclock": "2026-06-24T05:01:24.975629",
"loss": 0.091,
"grad_norm": 0.7505941390991211,
"learning_rate": 9.986474925059551e-06,
"step_time_sec": 103.78
},
{
"step": 200,
"epoch": 0.10815926451700128,
"wallclock": "2026-06-24T05:03:10.604700",
"loss": 0.1116,
"grad_norm": 0.6309108138084412,
"learning_rate": 9.984817716196075e-06,
"step_time_sec": 105.63,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 200,
"epoch": 0.10815926451700128,
"wallclock": "2026-06-24T05:04:50.512683",
"eval_loss": 0.08997273445129395,
"eval_runtime": 99.9051,
"eval_samples_per_second": 5.005,
"eval_steps_per_second": 1.251,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 205,
"epoch": 0.11086324612992632,
"wallclock": "2026-06-24T05:07:50.102603",
"loss": 0.0916,
"grad_norm": 0.4750153422355652,
"learning_rate": 9.983064914443293e-06,
"step_time_sec": 279.5
},
{
"step": 210,
"epoch": 0.11356722774285134,
"wallclock": "2026-06-24T05:09:36.370868",
"loss": 0.1137,
"grad_norm": 0.6414338946342468,
"learning_rate": 9.981216553414342e-06,
"step_time_sec": 106.27
},
{
"step": 215,
"epoch": 0.11627120935577638,
"wallclock": "2026-06-24T05:11:30.304900",
"loss": 0.1261,
"grad_norm": 0.7359138131141663,
"learning_rate": 9.979272668554885e-06,
"step_time_sec": 113.93
},
{
"step": 220,
"epoch": 0.11897519096870142,
"wallclock": "2026-06-24T05:13:14.700481",
"loss": 0.1016,
"grad_norm": 0.8333423137664795,
"learning_rate": 9.97723329714243e-06,
"step_time_sec": 104.4
},
{
"step": 225,
"epoch": 0.12167917258162644,
"wallclock": "2026-06-24T05:14:58.874076",
"loss": 0.1084,
"grad_norm": 0.7175215482711792,
"learning_rate": 9.97509847828561e-06,
"step_time_sec": 104.17
},
{
"step": 230,
"epoch": 0.12438315419455148,
"wallclock": "2026-06-24T05:16:44.695629",
"loss": 0.1165,
"grad_norm": 0.5170373320579529,
"learning_rate": 9.972868252923433e-06,
"step_time_sec": 105.82
},
{
"step": 235,
"epoch": 0.12708713580747652,
"wallclock": "2026-06-24T05:18:30.420350",
"loss": 0.1014,
"grad_norm": 1.0086610317230225,
"learning_rate": 9.970542663824504e-06,
"step_time_sec": 105.72
},
{
"step": 240,
"epoch": 0.12979111742040153,
"wallclock": "2026-06-24T05:20:15.400259",
"loss": 0.102,
"grad_norm": 0.6341211199760437,
"learning_rate": 9.968121755586196e-06,
"step_time_sec": 104.98
},
{
"step": 245,
"epoch": 0.13249509903332657,
"wallclock": "2026-06-24T05:22:01.027986",
"loss": 0.1059,
"grad_norm": 0.7365284562110901,
"learning_rate": 9.965605574633798e-06,
"step_time_sec": 105.63
},
{
"step": 250,
"epoch": 0.1351990806462516,
"wallclock": "2026-06-24T05:23:47.012942",
"loss": 0.0803,
"grad_norm": 0.6329382061958313,
"learning_rate": 9.96299416921963e-06,
"step_time_sec": 105.98,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 255,
"epoch": 0.13790306225917665,
"wallclock": "2026-06-24T05:25:33.107736",
"loss": 0.1085,
"grad_norm": 0.6384260654449463,
"learning_rate": 9.960287589422111e-06,
"step_time_sec": 106.09
},
{
"step": 260,
"epoch": 0.14060704387210168,
"wallclock": "2026-06-24T05:27:19.205894",
"loss": 0.0953,
"grad_norm": 0.7029681205749512,
"learning_rate": 9.957485887144797e-06,
"step_time_sec": 106.1
},
{
"step": 265,
"epoch": 0.1433110254850267,
"wallclock": "2026-06-24T05:29:05.191995",
"loss": 0.0855,
"grad_norm": 0.7882628440856934,
"learning_rate": 9.954589116115398e-06,
"step_time_sec": 105.99
},
{
"step": 270,
"epoch": 0.14601500709795173,
"wallclock": "2026-06-24T05:30:49.706053",
"loss": 0.0996,
"grad_norm": 0.8211791515350342,
"learning_rate": 9.95159733188473e-06,
"step_time_sec": 104.51
},
{
"step": 275,
"epoch": 0.14871898871087677,
"wallclock": "2026-06-24T05:32:37.121576",
"loss": 0.1183,
"grad_norm": 0.8806095719337463,
"learning_rate": 9.948510591825666e-06,
"step_time_sec": 107.42
},
{
"step": 280,
"epoch": 0.1514229703238018,
"wallclock": "2026-06-24T05:34:21.610766",
"loss": 0.0954,
"grad_norm": 0.7867270708084106,
"learning_rate": 9.945328955132023e-06,
"step_time_sec": 104.49
},
{
"step": 285,
"epoch": 0.15412695193672682,
"wallclock": "2026-06-24T05:36:07.329379",
"loss": 0.0886,
"grad_norm": 0.7445922493934631,
"learning_rate": 9.942052482817436e-06,
"step_time_sec": 105.72
},
{
"step": 290,
"epoch": 0.15683093354965186,
"wallclock": "2026-06-24T05:37:53.691605",
"loss": 0.0762,
"grad_norm": 0.4893661141395569,
"learning_rate": 9.938681237714186e-06,
"step_time_sec": 106.36
},
{
"step": 295,
"epoch": 0.1595349151625769,
"wallclock": "2026-06-24T05:39:38.617866",
"loss": 0.1037,
"grad_norm": 0.7313506603240967,
"learning_rate": 9.935215284471989e-06,
"step_time_sec": 104.93
},
{
"step": 300,
"epoch": 0.16223889677550193,
"wallclock": "2026-06-24T05:41:23.828815",
"loss": 0.0868,
"grad_norm": 0.7617091536521912,
"learning_rate": 9.93165468955676e-06,
"step_time_sec": 105.21,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 300,
"epoch": 0.16223889677550193,
"wallclock": "2026-06-24T05:43:03.826680",
"eval_loss": 0.0827580988407135,
"eval_runtime": 99.9942,
"eval_samples_per_second": 5.0,
"eval_steps_per_second": 1.25,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 305,
"epoch": 0.16494287838842697,
"wallclock": "2026-06-24T05:46:13.578643",
"loss": 0.074,
"grad_norm": 0.4504067301750183,
"learning_rate": 9.927999521249347e-06,
"step_time_sec": 289.75
},
{
"step": 310,
"epoch": 0.16764686000135198,
"wallclock": "2026-06-24T05:48:00.303575",
"loss": 0.1073,
"grad_norm": 0.6431950330734253,
"learning_rate": 9.924249849644205e-06,
"step_time_sec": 106.72
},
{
"step": 315,
"epoch": 0.17035084161427702,
"wallclock": "2026-06-24T05:49:45.680208",
"loss": 0.1112,
"grad_norm": 0.9043431878089905,
"learning_rate": 9.920405746648067e-06,
"step_time_sec": 105.38
},
{
"step": 320,
"epoch": 0.17305482322720206,
"wallclock": "2026-06-24T05:51:32.404247",
"loss": 0.0764,
"grad_norm": 0.6045661568641663,
"learning_rate": 9.916467285978556e-06,
"step_time_sec": 106.72
},
{
"step": 325,
"epoch": 0.1757588048401271,
"wallclock": "2026-06-24T05:53:18.181343",
"loss": 0.0956,
"grad_norm": 0.8464241623878479,
"learning_rate": 9.912434543162769e-06,
"step_time_sec": 105.78
},
{
"step": 330,
"epoch": 0.1784627864530521,
"wallclock": "2026-06-24T05:55:03.598110",
"loss": 0.1038,
"grad_norm": 0.43105682730674744,
"learning_rate": 9.908307595535842e-06,
"step_time_sec": 105.42
},
{
"step": 335,
"epoch": 0.18116676806597715,
"wallclock": "2026-06-24T05:56:50.010277",
"loss": 0.1136,
"grad_norm": 0.4054422676563263,
"learning_rate": 9.904086522239455e-06,
"step_time_sec": 106.41
},
{
"step": 340,
"epoch": 0.18387074967890218,
"wallclock": "2026-06-24T05:58:35.002723",
"loss": 0.081,
"grad_norm": 0.7263162732124329,
"learning_rate": 9.899771404220318e-06,
"step_time_sec": 104.99
},
{
"step": 345,
"epoch": 0.18657473129182722,
"wallclock": "2026-06-24T06:00:19.132697",
"loss": 0.1075,
"grad_norm": 1.7756342887878418,
"learning_rate": 9.895362324228616e-06,
"step_time_sec": 104.13
},
{
"step": 350,
"epoch": 0.18927871290475226,
"wallclock": "2026-06-24T06:02:04.923652",
"loss": 0.0862,
"grad_norm": 0.4385850131511688,
"learning_rate": 9.890859366816429e-06,
"step_time_sec": 105.79,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 355,
"epoch": 0.19198269451767727,
"wallclock": "2026-06-24T06:03:51.291232",
"loss": 0.1074,
"grad_norm": 0.5257728099822998,
"learning_rate": 9.886262618336103e-06,
"step_time_sec": 106.37
},
{
"step": 360,
"epoch": 0.1946866761306023,
"wallclock": "2026-06-24T06:05:36.306087",
"loss": 0.1147,
"grad_norm": 0.6658884882926941,
"learning_rate": 9.881572166938598e-06,
"step_time_sec": 105.01
},
{
"step": 365,
"epoch": 0.19739065774352735,
"wallclock": "2026-06-24T06:07:22.917361",
"loss": 0.0827,
"grad_norm": 0.5998860597610474,
"learning_rate": 9.876788102571797e-06,
"step_time_sec": 106.61
},
{
"step": 370,
"epoch": 0.2000946393564524,
"wallclock": "2026-06-24T06:09:08.111845",
"loss": 0.1197,
"grad_norm": 0.8167080879211426,
"learning_rate": 9.871910516978782e-06,
"step_time_sec": 105.19
},
{
"step": 375,
"epoch": 0.2027986209693774,
"wallclock": "2026-06-24T06:10:53.235112",
"loss": 0.1089,
"grad_norm": 0.8197498321533203,
"learning_rate": 9.86693950369607e-06,
"step_time_sec": 105.12
},
{
"step": 380,
"epoch": 0.20550260258230243,
"wallclock": "2026-06-24T06:12:39.106309",
"loss": 0.0881,
"grad_norm": 0.5486798882484436,
"learning_rate": 9.861875158051831e-06,
"step_time_sec": 105.87
},
{
"step": 385,
"epoch": 0.20820658419522747,
"wallclock": "2026-06-24T06:14:24.784134",
"loss": 0.0849,
"grad_norm": 0.6048823595046997,
"learning_rate": 9.85671757716404e-06,
"step_time_sec": 105.68
},
{
"step": 390,
"epoch": 0.2109105658081525,
"wallclock": "2026-06-24T06:16:09.415638",
"loss": 0.1078,
"grad_norm": 0.4019126296043396,
"learning_rate": 9.851466859938637e-06,
"step_time_sec": 104.63
},
{
"step": 395,
"epoch": 0.21361454742107755,
"wallclock": "2026-06-24T06:17:56.691186",
"loss": 0.1119,
"grad_norm": 0.6954424381256104,
"learning_rate": 9.84612310706761e-06,
"step_time_sec": 107.28
},
{
"step": 400,
"epoch": 0.21631852903400256,
"wallclock": "2026-06-24T06:19:42.292515",
"loss": 0.0945,
"grad_norm": 0.6359832882881165,
"learning_rate": 9.840686421027085e-06,
"step_time_sec": 105.6,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 400,
"epoch": 0.21631852903400256,
"wallclock": "2026-06-24T06:21:22.500688",
"eval_loss": 0.08142668008804321,
"eval_runtime": 100.2043,
"eval_samples_per_second": 4.99,
"eval_steps_per_second": 1.247,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 405,
"epoch": 0.2190225106469276,
"wallclock": "2026-06-24T06:24:29.504207",
"loss": 0.0882,
"grad_norm": 0.6086682081222534,
"learning_rate": 9.835156906075338e-06,
"step_time_sec": 287.21
},
{
"step": 410,
"epoch": 0.22172649225985264,
"wallclock": "2026-06-24T06:26:16.103755",
"loss": 0.1007,
"grad_norm": 1.0316163301467896,
"learning_rate": 9.829534668250814e-06,
"step_time_sec": 106.6
},
{
"step": 415,
"epoch": 0.22443047387277767,
"wallclock": "2026-06-24T06:28:01.409742",
"loss": 0.093,
"grad_norm": 0.7095230221748352,
"learning_rate": 9.823819815370084e-06,
"step_time_sec": 105.31
},
{
"step": 420,
"epoch": 0.22713445548570269,
"wallclock": "2026-06-24T06:29:47.402659",
"loss": 0.1019,
"grad_norm": 0.7305953502655029,
"learning_rate": 9.818012457025782e-06,
"step_time_sec": 105.99
},
{
"step": 425,
"epoch": 0.22983843709862772,
"wallclock": "2026-06-24T06:31:34.377021",
"loss": 0.1021,
"grad_norm": 0.5319082140922546,
"learning_rate": 9.812112704584503e-06,
"step_time_sec": 106.97
},
{
"step": 430,
"epoch": 0.23254241871155276,
"wallclock": "2026-06-24T06:33:19.008040",
"loss": 0.1063,
"grad_norm": 0.8568723797798157,
"learning_rate": 9.806120671184658e-06,
"step_time_sec": 104.63
},
{
"step": 435,
"epoch": 0.2352464003244778,
"wallclock": "2026-06-24T06:35:03.722758",
"loss": 0.0605,
"grad_norm": 0.45772790908813477,
"learning_rate": 9.80003647173432e-06,
"step_time_sec": 104.71
},
{
"step": 440,
"epoch": 0.23795038193740284,
"wallclock": "2026-06-24T06:36:50.225935",
"loss": 0.0991,
"grad_norm": 0.7904582023620605,
"learning_rate": 9.793860222909012e-06,
"step_time_sec": 106.5
},
{
"step": 445,
"epoch": 0.24065436355032785,
"wallclock": "2026-06-24T06:38:35.809144",
"loss": 0.0913,
"grad_norm": 0.5529101490974426,
"learning_rate": 9.787592043149467e-06,
"step_time_sec": 105.58
},
{
"step": 450,
"epoch": 0.2433583451632529,
"wallclock": "2026-06-24T06:40:20.203877",
"loss": 0.0679,
"grad_norm": 0.7900363206863403,
"learning_rate": 9.78123205265936e-06,
"step_time_sec": 104.39,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 455,
"epoch": 0.24606232677617793,
"wallclock": "2026-06-24T06:42:06.492257",
"loss": 0.1056,
"grad_norm": 0.6831784248352051,
"learning_rate": 9.774780373403003e-06,
"step_time_sec": 106.29
},
{
"step": 460,
"epoch": 0.24876630838910296,
"wallclock": "2026-06-24T06:43:52.409118",
"loss": 0.0933,
"grad_norm": 0.5478017330169678,
"learning_rate": 9.768237129103009e-06,
"step_time_sec": 105.92
},
{
"step": 465,
"epoch": 0.251470290002028,
"wallclock": "2026-06-24T06:45:37.500906",
"loss": 0.089,
"grad_norm": 0.4542732238769531,
"learning_rate": 9.761602445237914e-06,
"step_time_sec": 105.09
},
{
"step": 470,
"epoch": 0.25417427161495304,
"wallclock": "2026-06-24T06:47:23.183901",
"loss": 0.1057,
"grad_norm": 2.437464714050293,
"learning_rate": 9.75487644903977e-06,
"step_time_sec": 105.68
},
{
"step": 475,
"epoch": 0.25687825322787805,
"wallclock": "2026-06-24T06:49:09.011851",
"loss": 0.0774,
"grad_norm": 0.6896166801452637,
"learning_rate": 9.748059269491711e-06,
"step_time_sec": 105.83
},
{
"step": 480,
"epoch": 0.25958223484080306,
"wallclock": "2026-06-24T06:50:54.196439",
"loss": 0.0913,
"grad_norm": 0.5685729384422302,
"learning_rate": 9.741151037325481e-06,
"step_time_sec": 105.18
},
{
"step": 485,
"epoch": 0.2622862164537281,
"wallclock": "2026-06-24T06:52:39.978133",
"loss": 0.086,
"grad_norm": 0.8516511917114258,
"learning_rate": 9.73415188501891e-06,
"step_time_sec": 105.78
},
{
"step": 490,
"epoch": 0.26499019806665314,
"wallclock": "2026-06-24T06:54:27.309828",
"loss": 0.0872,
"grad_norm": 0.7482581734657288,
"learning_rate": 9.727061946793402e-06,
"step_time_sec": 107.33
},
{
"step": 495,
"epoch": 0.2676941796795782,
"wallclock": "2026-06-24T06:56:12.188135",
"loss": 0.0733,
"grad_norm": 0.544495701789856,
"learning_rate": 9.71988135861133e-06,
"step_time_sec": 104.88
},
{
"step": 500,
"epoch": 0.2703981612925032,
"wallclock": "2026-06-24T06:57:57.321125",
"loss": 0.0771,
"grad_norm": 0.6160959005355835,
"learning_rate": 9.712610258173453e-06,
"step_time_sec": 105.13,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.98
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 500,
"epoch": 0.2703981612925032,
"wallclock": "2026-06-24T06:59:37.554773",
"eval_loss": 0.0792667418718338,
"eval_runtime": 100.2297,
"eval_samples_per_second": 4.989,
"eval_steps_per_second": 1.247,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 505,
"epoch": 0.2731021429054282,
"wallclock": "2026-06-24T07:02:45.894476",
"loss": 0.0798,
"grad_norm": 0.8482615351676941,
"learning_rate": 9.705248784916267e-06,
"step_time_sec": 288.57
},
{
"step": 510,
"epoch": 0.2758061245183533,
"wallclock": "2026-06-24T07:04:30.505457",
"loss": 0.0955,
"grad_norm": 0.5648516416549683,
"learning_rate": 9.697797080009323e-06,
"step_time_sec": 104.61
},
{
"step": 515,
"epoch": 0.2785101061312783,
"wallclock": "2026-06-24T07:06:15.804916",
"loss": 0.082,
"grad_norm": 0.6227542757987976,
"learning_rate": 9.690255286352532e-06,
"step_time_sec": 105.3
},
{
"step": 520,
"epoch": 0.28121408774420337,
"wallclock": "2026-06-24T07:08:01.704738",
"loss": 0.1104,
"grad_norm": 0.7219036221504211,
"learning_rate": 9.682623548573418e-06,
"step_time_sec": 105.9
},
{
"step": 525,
"epoch": 0.2839180693571284,
"wallclock": "2026-06-24T07:09:47.411077",
"loss": 0.0873,
"grad_norm": 0.5870639681816101,
"learning_rate": 9.674902013024348e-06,
"step_time_sec": 105.71
},
{
"step": 530,
"epoch": 0.2866220509700534,
"wallclock": "2026-06-24T07:11:33.115669",
"loss": 0.1001,
"grad_norm": 0.5214188694953918,
"learning_rate": 9.667090827779721e-06,
"step_time_sec": 105.7
},
{
"step": 535,
"epoch": 0.28932603258297845,
"wallclock": "2026-06-24T07:13:18.886097",
"loss": 0.0888,
"grad_norm": 0.5477219223976135,
"learning_rate": 9.659190142633133e-06,
"step_time_sec": 105.77
},
{
"step": 540,
"epoch": 0.29203001419590346,
"wallclock": "2026-06-24T07:15:03.294899",
"loss": 0.0893,
"grad_norm": 0.6372500061988831,
"learning_rate": 9.651200109094498e-06,
"step_time_sec": 104.41
},
{
"step": 545,
"epoch": 0.2947339958088285,
"wallclock": "2026-06-24T07:16:49.807494",
"loss": 0.0706,
"grad_norm": 0.6478589177131653,
"learning_rate": 9.643120880387155e-06,
"step_time_sec": 106.51
},
{
"step": 550,
"epoch": 0.29743797742175354,
"wallclock": "2026-06-24T07:18:36.819978",
"loss": 0.0848,
"grad_norm": 0.7352571487426758,
"learning_rate": 9.634952611444914e-06,
"step_time_sec": 107.01,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 555,
"epoch": 0.30014195903467855,
"wallclock": "2026-06-24T07:20:22.813541",
"loss": 0.0956,
"grad_norm": 0.8457625508308411,
"learning_rate": 9.626695458909098e-06,
"step_time_sec": 105.99
},
{
"step": 560,
"epoch": 0.3028459406476036,
"wallclock": "2026-06-24T07:22:08.475130",
"loss": 0.082,
"grad_norm": 0.8473530411720276,
"learning_rate": 9.618349581125529e-06,
"step_time_sec": 105.66
},
{
"step": 565,
"epoch": 0.30554992226052863,
"wallclock": "2026-06-24T07:23:54.730055",
"loss": 0.0666,
"grad_norm": 0.7220405340194702,
"learning_rate": 9.609915138141497e-06,
"step_time_sec": 106.25
},
{
"step": 570,
"epoch": 0.30825390387345364,
"wallclock": "2026-06-24T07:25:39.415962",
"loss": 0.0714,
"grad_norm": 0.6538407206535339,
"learning_rate": 9.601392291702693e-06,
"step_time_sec": 104.69
},
{
"step": 575,
"epoch": 0.3109578854863787,
"wallclock": "2026-06-24T07:27:23.791188",
"loss": 0.073,
"grad_norm": 0.851050853729248,
"learning_rate": 9.592781205250102e-06,
"step_time_sec": 104.38
},
{
"step": 580,
"epoch": 0.3136618670993037,
"wallclock": "2026-06-24T07:29:08.982414",
"loss": 0.0972,
"grad_norm": 0.7455153465270996,
"learning_rate": 9.584082043916867e-06,
"step_time_sec": 105.19
},
{
"step": 585,
"epoch": 0.3163658487122288,
"wallclock": "2026-06-24T07:30:54.304933",
"loss": 0.0728,
"grad_norm": 0.39666956663131714,
"learning_rate": 9.575294974525131e-06,
"step_time_sec": 105.32
},
{
"step": 590,
"epoch": 0.3190698303251538,
"wallclock": "2026-06-24T07:32:40.198440",
"loss": 0.0719,
"grad_norm": 0.283635675907135,
"learning_rate": 9.566420165582832e-06,
"step_time_sec": 105.89
},
{
"step": 595,
"epoch": 0.3217738119380788,
"wallclock": "2026-06-24T07:34:26.091391",
"loss": 0.0892,
"grad_norm": 0.6910920739173889,
"learning_rate": 9.557457787280474e-06,
"step_time_sec": 105.89
},
{
"step": 600,
"epoch": 0.32447779355100387,
"wallclock": "2026-06-24T07:36:12.209290",
"loss": 0.0895,
"grad_norm": 0.6658245325088501,
"learning_rate": 9.548408011487857e-06,
"step_time_sec": 106.12,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 600,
"epoch": 0.32447779355100387,
"wallclock": "2026-06-24T07:37:52.689311",
"eval_loss": 0.07775916159152985,
"eval_runtime": 100.4763,
"eval_samples_per_second": 4.976,
"eval_steps_per_second": 1.244,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 605,
"epoch": 0.3271817751639289,
"wallclock": "2026-06-24T07:40:59.587964",
"loss": 0.0984,
"grad_norm": 0.6905266046524048,
"learning_rate": 9.539271011750787e-06,
"step_time_sec": 287.38
},
{
"step": 610,
"epoch": 0.32988575677685394,
"wallclock": "2026-06-24T07:42:47.180462",
"loss": 0.0823,
"grad_norm": 0.6062604784965515,
"learning_rate": 9.530046963287753e-06,
"step_time_sec": 107.59
},
{
"step": 615,
"epoch": 0.33258973838977895,
"wallclock": "2026-06-24T07:44:31.276448",
"loss": 0.0767,
"grad_norm": 0.7175081968307495,
"learning_rate": 9.520736042986551e-06,
"step_time_sec": 104.1
},
{
"step": 620,
"epoch": 0.33529372000270397,
"wallclock": "2026-06-24T07:46:15.885770",
"loss": 0.0882,
"grad_norm": 0.898894190788269,
"learning_rate": 9.51133842940091e-06,
"step_time_sec": 104.61
},
{
"step": 625,
"epoch": 0.33799770161562903,
"wallclock": "2026-06-24T07:48:01.530170",
"loss": 0.0873,
"grad_norm": 0.5385039448738098,
"learning_rate": 9.501854302747053e-06,
"step_time_sec": 105.64
},
{
"step": 630,
"epoch": 0.34070168322855404,
"wallclock": "2026-06-24T07:49:46.498449",
"loss": 0.0817,
"grad_norm": 0.5420588850975037,
"learning_rate": 9.492283844900255e-06,
"step_time_sec": 104.97
},
{
"step": 635,
"epoch": 0.34340566484147905,
"wallclock": "2026-06-24T07:51:31.809385",
"loss": 0.0879,
"grad_norm": 1.3086037635803223,
"learning_rate": 9.482627239391335e-06,
"step_time_sec": 105.31
},
{
"step": 640,
"epoch": 0.3461096464544041,
"wallclock": "2026-06-24T07:53:18.306307",
"loss": 0.0728,
"grad_norm": 0.6617655158042908,
"learning_rate": 9.472884671403164e-06,
"step_time_sec": 106.5
},
{
"step": 645,
"epoch": 0.34881362806732913,
"wallclock": "2026-06-24T07:55:03.697153",
"loss": 0.0593,
"grad_norm": 0.6209415197372437,
"learning_rate": 9.46305632776709e-06,
"step_time_sec": 105.39
},
{
"step": 650,
"epoch": 0.3515176096802542,
"wallclock": "2026-06-24T07:56:48.600267",
"loss": 0.0816,
"grad_norm": 1.021694302558899,
"learning_rate": 9.453142396959364e-06,
"step_time_sec": 104.9,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 655,
"epoch": 0.3542215912931792,
"wallclock": "2026-06-24T07:58:33.519930",
"loss": 0.0802,
"grad_norm": 0.8565160036087036,
"learning_rate": 9.443143069097531e-06,
"step_time_sec": 104.92
},
{
"step": 660,
"epoch": 0.3569255729061042,
"wallclock": "2026-06-24T08:00:17.709412",
"loss": 0.073,
"grad_norm": 0.8225128650665283,
"learning_rate": 9.433058535936775e-06,
"step_time_sec": 104.19
},
{
"step": 665,
"epoch": 0.3596295545190293,
"wallclock": "2026-06-24T08:02:02.897175",
"loss": 0.0805,
"grad_norm": 0.8371864557266235,
"learning_rate": 9.422888990866243e-06,
"step_time_sec": 105.19
},
{
"step": 670,
"epoch": 0.3623335361319543,
"wallclock": "2026-06-24T08:03:49.320420",
"loss": 0.0855,
"grad_norm": 0.6681428551673889,
"learning_rate": 9.412634628905345e-06,
"step_time_sec": 106.42
},
{
"step": 675,
"epoch": 0.36503751774487936,
"wallclock": "2026-06-24T08:05:33.903831",
"loss": 0.0768,
"grad_norm": 0.6769019365310669,
"learning_rate": 9.402295646700005e-06,
"step_time_sec": 104.58
},
{
"step": 680,
"epoch": 0.36774149935780437,
"wallclock": "2026-06-24T08:07:19.895810",
"loss": 0.0829,
"grad_norm": 0.5479181408882141,
"learning_rate": 9.391872242518895e-06,
"step_time_sec": 105.99
},
{
"step": 685,
"epoch": 0.3704454809707294,
"wallclock": "2026-06-24T08:09:05.010792",
"loss": 0.0745,
"grad_norm": 0.499809205532074,
"learning_rate": 9.381364616249627e-06,
"step_time_sec": 105.11
},
{
"step": 690,
"epoch": 0.37314946258365445,
"wallclock": "2026-06-24T08:10:50.298200",
"loss": 0.0735,
"grad_norm": 1.0203771591186523,
"learning_rate": 9.370772969394927e-06,
"step_time_sec": 105.29
},
{
"step": 695,
"epoch": 0.37585344419657946,
"wallclock": "2026-06-24T08:12:35.415602",
"loss": 0.0705,
"grad_norm": 0.7761706113815308,
"learning_rate": 9.360097505068767e-06,
"step_time_sec": 105.12
},
{
"step": 700,
"epoch": 0.3785574258095045,
"wallclock": "2026-06-24T08:14:21.710932",
"loss": 0.0984,
"grad_norm": 2.708282709121704,
"learning_rate": 9.349338427992471e-06,
"step_time_sec": 106.3,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 700,
"epoch": 0.3785574258095045,
"wallclock": "2026-06-24T08:16:02.035763",
"eval_loss": 0.07459608465433121,
"eval_runtime": 100.3142,
"eval_samples_per_second": 4.984,
"eval_steps_per_second": 1.246,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 705,
"epoch": 0.38126140742242953,
"wallclock": "2026-06-24T08:19:08.313913",
"loss": 0.0745,
"grad_norm": 0.4694831073284149,
"learning_rate": 9.338495944490788e-06,
"step_time_sec": 286.6
},
{
"step": 710,
"epoch": 0.38396538903535454,
"wallclock": "2026-06-24T08:20:53.768411",
"loss": 0.1006,
"grad_norm": 0.715364396572113,
"learning_rate": 9.327570262487934e-06,
"step_time_sec": 105.45
},
{
"step": 715,
"epoch": 0.3866693706482796,
"wallclock": "2026-06-24T08:22:39.591734",
"loss": 0.0863,
"grad_norm": 0.5401411056518555,
"learning_rate": 9.316561591503612e-06,
"step_time_sec": 105.82
},
{
"step": 720,
"epoch": 0.3893733522612046,
"wallclock": "2026-06-24T08:24:24.650372",
"loss": 0.0955,
"grad_norm": 0.5890225768089294,
"learning_rate": 9.305470142648982e-06,
"step_time_sec": 105.06
},
{
"step": 725,
"epoch": 0.39207733387412963,
"wallclock": "2026-06-24T08:26:09.603072",
"loss": 0.0733,
"grad_norm": 0.6464399695396423,
"learning_rate": 9.294296128622625e-06,
"step_time_sec": 104.95
},
{
"step": 730,
"epoch": 0.3947813154870547,
"wallclock": "2026-06-24T08:27:56.312801",
"loss": 0.0723,
"grad_norm": 0.5359171628952026,
"learning_rate": 9.283039763706455e-06,
"step_time_sec": 106.71
},
{
"step": 735,
"epoch": 0.3974852970999797,
"wallclock": "2026-06-24T08:29:41.888724",
"loss": 0.0767,
"grad_norm": 0.7463257908821106,
"learning_rate": 9.27170126376161e-06,
"step_time_sec": 105.58
},
{
"step": 740,
"epoch": 0.4001892787129048,
"wallclock": "2026-06-24T08:31:26.792262",
"loss": 0.0696,
"grad_norm": 0.8311108946800232,
"learning_rate": 9.260280846224328e-06,
"step_time_sec": 104.9
},
{
"step": 745,
"epoch": 0.4028932603258298,
"wallclock": "2026-06-24T08:33:12.308668",
"loss": 0.0674,
"grad_norm": 0.7888720631599426,
"learning_rate": 9.24877873010175e-06,
"step_time_sec": 105.52
},
{
"step": 750,
"epoch": 0.4055972419387548,
"wallclock": "2026-06-24T08:34:57.915757",
"loss": 0.0689,
"grad_norm": 0.5358040928840637,
"learning_rate": 9.237195135967746e-06,
"step_time_sec": 105.61,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 755,
"epoch": 0.40830122355167986,
"wallclock": "2026-06-24T08:36:43.108457",
"loss": 0.0991,
"grad_norm": 0.7390076518058777,
"learning_rate": 9.225530285958669e-06,
"step_time_sec": 105.19
},
{
"step": 760,
"epoch": 0.41100520516460487,
"wallclock": "2026-06-24T08:38:29.912596",
"loss": 0.0848,
"grad_norm": 0.6667785048484802,
"learning_rate": 9.213784403769097e-06,
"step_time_sec": 106.8
},
{
"step": 765,
"epoch": 0.41370918677752994,
"wallclock": "2026-06-24T08:40:15.181668",
"loss": 0.064,
"grad_norm": 1.133137583732605,
"learning_rate": 9.201957714647554e-06,
"step_time_sec": 105.27
},
{
"step": 770,
"epoch": 0.41641316839045495,
"wallclock": "2026-06-24T08:42:01.118538",
"loss": 0.0654,
"grad_norm": 0.8551876544952393,
"learning_rate": 9.19005044539218e-06,
"step_time_sec": 105.94
},
{
"step": 775,
"epoch": 0.41911715000337996,
"wallclock": "2026-06-24T08:43:47.610874",
"loss": 0.0846,
"grad_norm": 0.7466854453086853,
"learning_rate": 9.178062824346383e-06,
"step_time_sec": 106.49
},
{
"step": 780,
"epoch": 0.421821131616305,
"wallclock": "2026-06-24T08:45:33.303287",
"loss": 0.0912,
"grad_norm": 0.5469369292259216,
"learning_rate": 9.165995081394463e-06,
"step_time_sec": 105.69
},
{
"step": 785,
"epoch": 0.42452511322923003,
"wallclock": "2026-06-24T08:47:18.578926",
"loss": 0.0864,
"grad_norm": 0.9799915552139282,
"learning_rate": 9.153847447957205e-06,
"step_time_sec": 105.28
},
{
"step": 790,
"epoch": 0.4272290948421551,
"wallclock": "2026-06-24T08:49:06.294584",
"loss": 0.0954,
"grad_norm": 0.6794901490211487,
"learning_rate": 9.141620156987432e-06,
"step_time_sec": 107.72
},
{
"step": 795,
"epoch": 0.4299330764550801,
"wallclock": "2026-06-24T08:50:52.095214",
"loss": 0.0547,
"grad_norm": 0.6824802160263062,
"learning_rate": 9.12931344296555e-06,
"step_time_sec": 105.8
},
{
"step": 800,
"epoch": 0.4326370580680051,
"wallclock": "2026-06-24T08:52:36.490555",
"loss": 0.0677,
"grad_norm": 0.5517615675926208,
"learning_rate": 9.116927541895042e-06,
"step_time_sec": 104.4,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 800,
"epoch": 0.4326370580680051,
"wallclock": "2026-06-24T08:54:16.950971",
"eval_loss": 0.07352492958307266,
"eval_runtime": 100.4561,
"eval_samples_per_second": 4.977,
"eval_steps_per_second": 1.244,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 805,
"epoch": 0.4353410396809302,
"wallclock": "2026-06-24T08:57:27.878085",
"loss": 0.0604,
"grad_norm": 0.561560869216919,
"learning_rate": 9.10446269129795e-06,
"step_time_sec": 291.39
},
{
"step": 810,
"epoch": 0.4380450212938552,
"wallclock": "2026-06-24T08:59:12.110183",
"loss": 0.0793,
"grad_norm": 0.784087061882019,
"learning_rate": 9.091919130210313e-06,
"step_time_sec": 104.23
},
{
"step": 815,
"epoch": 0.4407490029067802,
"wallclock": "2026-06-24T09:00:57.619395",
"loss": 0.0523,
"grad_norm": 0.47488337755203247,
"learning_rate": 9.079297099177585e-06,
"step_time_sec": 105.51
},
{
"step": 820,
"epoch": 0.4434529845197053,
"wallclock": "2026-06-24T09:02:44.201504",
"loss": 0.092,
"grad_norm": 0.6607430577278137,
"learning_rate": 9.066596840250024e-06,
"step_time_sec": 106.58
},
{
"step": 825,
"epoch": 0.4461569661326303,
"wallclock": "2026-06-24T09:04:30.495107",
"loss": 0.0559,
"grad_norm": 0.5975196361541748,
"learning_rate": 9.053818596978051e-06,
"step_time_sec": 106.29
},
{
"step": 830,
"epoch": 0.44886094774555535,
"wallclock": "2026-06-24T09:06:14.400764",
"loss": 0.0749,
"grad_norm": 0.5973978042602539,
"learning_rate": 9.040962614407574e-06,
"step_time_sec": 103.91
},
{
"step": 835,
"epoch": 0.45156492935848036,
"wallclock": "2026-06-24T09:08:00.823060",
"loss": 0.0673,
"grad_norm": 0.8808339238166809,
"learning_rate": 9.028029139075297e-06,
"step_time_sec": 106.42
},
{
"step": 840,
"epoch": 0.45426891097140537,
"wallclock": "2026-06-24T09:09:46.093782",
"loss": 0.0975,
"grad_norm": 0.9540690779685974,
"learning_rate": 9.015018419003982e-06,
"step_time_sec": 105.27
},
{
"step": 845,
"epoch": 0.45697289258433044,
"wallclock": "2026-06-24T09:11:31.785014",
"loss": 0.0815,
"grad_norm": 0.7579560875892639,
"learning_rate": 9.001930703697708e-06,
"step_time_sec": 105.69
},
{
"step": 850,
"epoch": 0.45967687419725545,
"wallclock": "2026-06-24T09:13:18.016135",
"loss": 0.077,
"grad_norm": 1.2188389301300049,
"learning_rate": 8.988766244137065e-06,
"step_time_sec": 106.23,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 855,
"epoch": 0.4623808558101805,
"wallclock": "2026-06-24T09:15:04.661348",
"loss": 0.0777,
"grad_norm": 0.7465812563896179,
"learning_rate": 8.975525292774362e-06,
"step_time_sec": 106.65
},
{
"step": 860,
"epoch": 0.4650848374231055,
"wallclock": "2026-06-24T09:16:51.200338",
"loss": 0.065,
"grad_norm": 0.634141206741333,
"learning_rate": 8.962208103528774e-06,
"step_time_sec": 106.54
},
{
"step": 865,
"epoch": 0.46778881903603053,
"wallclock": "2026-06-24T09:18:37.705663",
"loss": 0.0723,
"grad_norm": 0.5434172749519348,
"learning_rate": 8.948814931781472e-06,
"step_time_sec": 106.51
},
{
"step": 870,
"epoch": 0.4704928006489556,
"wallclock": "2026-06-24T09:20:22.682651",
"loss": 0.0674,
"grad_norm": 0.851901650428772,
"learning_rate": 8.935346034370732e-06,
"step_time_sec": 104.98
},
{
"step": 875,
"epoch": 0.4731967822618806,
"wallclock": "2026-06-24T09:22:08.000109",
"loss": 0.0648,
"grad_norm": 0.5568099617958069,
"learning_rate": 8.921801669587005e-06,
"step_time_sec": 105.32
},
{
"step": 880,
"epoch": 0.4759007638748057,
"wallclock": "2026-06-24T09:23:53.107265",
"loss": 0.073,
"grad_norm": 0.726121723651886,
"learning_rate": 8.908182097167965e-06,
"step_time_sec": 105.11
},
{
"step": 885,
"epoch": 0.4786047454877307,
"wallclock": "2026-06-24T09:25:38.993222",
"loss": 0.0676,
"grad_norm": 0.6540066003799438,
"learning_rate": 8.894487578293534e-06,
"step_time_sec": 105.89
},
{
"step": 890,
"epoch": 0.4813087271006557,
"wallclock": "2026-06-24T09:27:25.285677",
"loss": 0.0699,
"grad_norm": 0.5976990461349487,
"learning_rate": 8.880718375580857e-06,
"step_time_sec": 106.29
},
{
"step": 895,
"epoch": 0.48401270871358076,
"wallclock": "2026-06-24T09:29:12.234716",
"loss": 0.0687,
"grad_norm": 0.5673884749412537,
"learning_rate": 8.866874753079286e-06,
"step_time_sec": 106.95
},
{
"step": 900,
"epoch": 0.4867166903265058,
"wallclock": "2026-06-24T09:30:57.290947",
"loss": 0.0865,
"grad_norm": 0.825077474117279,
"learning_rate": 8.852956976265304e-06,
"step_time_sec": 105.06,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 900,
"epoch": 0.4867166903265058,
"wallclock": "2026-06-24T09:32:37.746275",
"eval_loss": 0.0734986960887909,
"eval_runtime": 100.4508,
"eval_samples_per_second": 4.978,
"eval_steps_per_second": 1.244,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 905,
"epoch": 0.4894206719394308,
"wallclock": "2026-06-24T09:35:44.600337",
"loss": 0.0661,
"grad_norm": 0.6024107933044434,
"learning_rate": 8.838965312037435e-06,
"step_time_sec": 287.31
},
{
"step": 910,
"epoch": 0.49212465355235585,
"wallclock": "2026-06-24T09:37:31.615349",
"loss": 0.0864,
"grad_norm": 0.7744714617729187,
"learning_rate": 8.824900028711128e-06,
"step_time_sec": 107.02
},
{
"step": 915,
"epoch": 0.49482863516528086,
"wallclock": "2026-06-24T09:39:17.106852",
"loss": 0.0608,
"grad_norm": 0.6232128143310547,
"learning_rate": 8.810761396013616e-06,
"step_time_sec": 105.49
},
{
"step": 920,
"epoch": 0.4975326167782059,
"wallclock": "2026-06-24T09:41:01.831415",
"loss": 0.0667,
"grad_norm": 0.7434114217758179,
"learning_rate": 8.796549685078732e-06,
"step_time_sec": 104.72
},
{
"step": 925,
"epoch": 0.500236598391131,
"wallclock": "2026-06-24T09:42:49.107296",
"loss": 0.0545,
"grad_norm": 0.5467560887336731,
"learning_rate": 8.782265168441722e-06,
"step_time_sec": 107.28
},
{
"step": 930,
"epoch": 0.502940580004056,
"wallclock": "2026-06-24T09:44:33.913707",
"loss": 0.0608,
"grad_norm": 0.49254247546195984,
"learning_rate": 8.76790812003401e-06,
"step_time_sec": 104.81
},
{
"step": 935,
"epoch": 0.505644561616981,
"wallclock": "2026-06-24T09:46:19.229213",
"loss": 0.0684,
"grad_norm": 0.48784705996513367,
"learning_rate": 8.753478815177947e-06,
"step_time_sec": 105.32
},
{
"step": 940,
"epoch": 0.5083485432299061,
"wallclock": "2026-06-24T09:48:05.411373",
"loss": 0.0731,
"grad_norm": 0.48523765802383423,
"learning_rate": 8.738977530581534e-06,
"step_time_sec": 106.18
},
{
"step": 945,
"epoch": 0.511052524842831,
"wallclock": "2026-06-24T09:49:51.295620",
"loss": 0.0843,
"grad_norm": 1.2344911098480225,
"learning_rate": 8.724404544333111e-06,
"step_time_sec": 105.88
},
{
"step": 950,
"epoch": 0.5137565064557561,
"wallclock": "2026-06-24T09:51:36.494251",
"loss": 0.0754,
"grad_norm": 0.9410877227783203,
"learning_rate": 8.709760135896033e-06,
"step_time_sec": 105.2,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 955,
"epoch": 0.5164604880686812,
"wallclock": "2026-06-24T09:53:21.420428",
"loss": 0.0843,
"grad_norm": 0.3976033329963684,
"learning_rate": 8.695044586103297e-06,
"step_time_sec": 104.93
},
{
"step": 960,
"epoch": 0.5191644696816061,
"wallclock": "2026-06-24T09:55:05.899797",
"loss": 0.0742,
"grad_norm": 0.7704766392707825,
"learning_rate": 8.680258177152166e-06,
"step_time_sec": 104.48
},
{
"step": 965,
"epoch": 0.5218684512945312,
"wallclock": "2026-06-24T09:56:51.000492",
"loss": 0.0685,
"grad_norm": 0.7557464838027954,
"learning_rate": 8.665401192598761e-06,
"step_time_sec": 105.1
},
{
"step": 970,
"epoch": 0.5245724329074563,
"wallclock": "2026-06-24T09:58:36.814098",
"loss": 0.0646,
"grad_norm": 0.8066175580024719,
"learning_rate": 8.65047391735261e-06,
"step_time_sec": 105.81
},
{
"step": 975,
"epoch": 0.5272764145203812,
"wallclock": "2026-06-24T10:00:21.726492",
"loss": 0.0703,
"grad_norm": 1.2292455434799194,
"learning_rate": 8.635476637671197e-06,
"step_time_sec": 104.91
},
{
"step": 980,
"epoch": 0.5299803961333063,
"wallclock": "2026-06-24T10:02:07.309600",
"loss": 0.0722,
"grad_norm": 0.7355031967163086,
"learning_rate": 8.620409641154465e-06,
"step_time_sec": 105.58
},
{
"step": 985,
"epoch": 0.5326843777462313,
"wallclock": "2026-06-24T10:03:54.319283",
"loss": 0.0593,
"grad_norm": 0.9767148494720459,
"learning_rate": 8.605273216739307e-06,
"step_time_sec": 107.01
},
{
"step": 990,
"epoch": 0.5353883593591564,
"wallclock": "2026-06-24T10:05:40.086818",
"loss": 0.0597,
"grad_norm": 0.6078879237174988,
"learning_rate": 8.590067654694017e-06,
"step_time_sec": 105.77
},
{
"step": 995,
"epoch": 0.5380923409720814,
"wallclock": "2026-06-24T10:07:24.909120",
"loss": 0.0819,
"grad_norm": 0.5737846493721008,
"learning_rate": 8.574793246612727e-06,
"step_time_sec": 104.82
},
{
"step": 1000,
"epoch": 0.5407963225850064,
"wallclock": "2026-06-24T10:09:11.121871",
"loss": 0.0704,
"grad_norm": 0.5743271708488464,
"learning_rate": 8.559450285409825e-06,
"step_time_sec": 106.21,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1000,
"epoch": 0.5407963225850064,
"wallclock": "2026-06-24T10:10:51.453842",
"eval_loss": 0.07477952539920807,
"eval_runtime": 100.3276,
"eval_samples_per_second": 4.984,
"eval_steps_per_second": 1.246,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1005,
"epoch": 0.5435003041979315,
"wallclock": "2026-06-24T10:13:58.794483",
"loss": 0.0445,
"grad_norm": 0.3843560218811035,
"learning_rate": 8.544039065314317e-06,
"step_time_sec": 287.67
},
{
"step": 1010,
"epoch": 0.5462042858108564,
"wallclock": "2026-06-24T10:15:44.821711",
"loss": 0.0824,
"grad_norm": 0.788098156452179,
"learning_rate": 8.528559881864209e-06,
"step_time_sec": 106.03
},
{
"step": 1015,
"epoch": 0.5489082674237815,
"wallclock": "2026-06-24T10:17:30.511914",
"loss": 0.0593,
"grad_norm": 0.5601520538330078,
"learning_rate": 8.513013031900814e-06,
"step_time_sec": 105.69
},
{
"step": 1020,
"epoch": 0.5516122490367066,
"wallclock": "2026-06-24T10:19:15.717942",
"loss": 0.0578,
"grad_norm": 0.4331408739089966,
"learning_rate": 8.497398813563086e-06,
"step_time_sec": 105.21
},
{
"step": 1025,
"epoch": 0.5543162306496315,
"wallclock": "2026-06-24T10:21:00.311987",
"loss": 0.0772,
"grad_norm": 0.7381686568260193,
"learning_rate": 8.48171752628188e-06,
"step_time_sec": 104.59
},
{
"step": 1030,
"epoch": 0.5570202122625566,
"wallclock": "2026-06-24T10:22:46.121305",
"loss": 0.052,
"grad_norm": 0.7812600135803223,
"learning_rate": 8.46596947077422e-06,
"step_time_sec": 105.81
},
{
"step": 1035,
"epoch": 0.5597241938754817,
"wallclock": "2026-06-24T10:24:31.783704",
"loss": 0.0757,
"grad_norm": 0.7333759069442749,
"learning_rate": 8.450154949037539e-06,
"step_time_sec": 105.66
},
{
"step": 1040,
"epoch": 0.5624281754884067,
"wallclock": "2026-06-24T10:26:17.283088",
"loss": 0.0588,
"grad_norm": 0.7570787668228149,
"learning_rate": 8.434274264343869e-06,
"step_time_sec": 105.5
},
{
"step": 1045,
"epoch": 0.5651321571013317,
"wallclock": "2026-06-24T10:28:03.723823",
"loss": 0.0491,
"grad_norm": 0.42195039987564087,
"learning_rate": 8.418327721234044e-06,
"step_time_sec": 106.44
},
{
"step": 1050,
"epoch": 0.5678361387142568,
"wallclock": "2026-06-24T10:29:49.107217",
"loss": 0.0524,
"grad_norm": 0.5051612257957458,
"learning_rate": 8.40231562551185e-06,
"step_time_sec": 105.38,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1055,
"epoch": 0.5705401203271818,
"wallclock": "2026-06-24T10:31:34.010033",
"loss": 0.0582,
"grad_norm": 0.6454283595085144,
"learning_rate": 8.386238284238163e-06,
"step_time_sec": 104.9
},
{
"step": 1060,
"epoch": 0.5732441019401068,
"wallclock": "2026-06-24T10:33:20.785609",
"loss": 0.0835,
"grad_norm": 0.9067649841308594,
"learning_rate": 8.37009600572506e-06,
"step_time_sec": 106.78
},
{
"step": 1065,
"epoch": 0.5759480835530318,
"wallclock": "2026-06-24T10:35:06.993088",
"loss": 0.0608,
"grad_norm": 0.5329049825668335,
"learning_rate": 8.35388909952991e-06,
"step_time_sec": 106.21
},
{
"step": 1070,
"epoch": 0.5786520651659569,
"wallclock": "2026-06-24T10:36:51.613853",
"loss": 0.0732,
"grad_norm": 1.0739482641220093,
"learning_rate": 8.337617876449427e-06,
"step_time_sec": 104.62
},
{
"step": 1075,
"epoch": 0.5813560467788819,
"wallclock": "2026-06-24T10:38:38.500861",
"loss": 0.0712,
"grad_norm": 0.5759013295173645,
"learning_rate": 8.321282648513727e-06,
"step_time_sec": 106.89
},
{
"step": 1080,
"epoch": 0.5840600283918069,
"wallclock": "2026-06-24T10:40:24.115300",
"loss": 0.06,
"grad_norm": 0.5534053444862366,
"learning_rate": 8.304883728980325e-06,
"step_time_sec": 105.61
},
{
"step": 1085,
"epoch": 0.586764010004732,
"wallclock": "2026-06-24T10:42:09.998730",
"loss": 0.0613,
"grad_norm": 0.7383453845977783,
"learning_rate": 8.288421432328146e-06,
"step_time_sec": 105.88
},
{
"step": 1090,
"epoch": 0.589467991617657,
"wallclock": "2026-06-24T10:43:56.874611",
"loss": 0.0611,
"grad_norm": 0.5191856026649475,
"learning_rate": 8.271896074251483e-06,
"step_time_sec": 106.88
},
{
"step": 1095,
"epoch": 0.592171973230582,
"wallclock": "2026-06-24T10:45:41.729013",
"loss": 0.0571,
"grad_norm": 0.5893298983573914,
"learning_rate": 8.255307971653941e-06,
"step_time_sec": 104.85
},
{
"step": 1100,
"epoch": 0.5948759548435071,
"wallclock": "2026-06-24T10:47:28.787718",
"loss": 0.0579,
"grad_norm": 0.6628295183181763,
"learning_rate": 8.238657442642375e-06,
"step_time_sec": 107.06,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1100,
"epoch": 0.5948759548435071,
"wallclock": "2026-06-24T10:49:09.237050",
"eval_loss": 0.07260795682668686,
"eval_runtime": 100.4446,
"eval_samples_per_second": 4.978,
"eval_steps_per_second": 1.244,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1105,
"epoch": 0.5975799364564321,
"wallclock": "2026-06-24T10:52:18.585949",
"loss": 0.0701,
"grad_norm": 0.9406186938285828,
"learning_rate": 8.221944806520768e-06,
"step_time_sec": 289.8
},
{
"step": 1110,
"epoch": 0.6002839180693571,
"wallclock": "2026-06-24T10:54:03.822666",
"loss": 0.0709,
"grad_norm": 0.6916760206222534,
"learning_rate": 8.205170383784125e-06,
"step_time_sec": 105.24
},
{
"step": 1115,
"epoch": 0.6029878996822822,
"wallclock": "2026-06-24T10:55:49.426292",
"loss": 0.0591,
"grad_norm": 0.4082253575325012,
"learning_rate": 8.188334496112322e-06,
"step_time_sec": 105.6
},
{
"step": 1120,
"epoch": 0.6056918812952072,
"wallclock": "2026-06-24T10:57:36.587027",
"loss": 0.053,
"grad_norm": 0.5415107011795044,
"learning_rate": 8.171437466363934e-06,
"step_time_sec": 107.16
},
{
"step": 1125,
"epoch": 0.6083958629081322,
"wallclock": "2026-06-24T10:59:22.605930",
"loss": 0.0612,
"grad_norm": 0.5975248217582703,
"learning_rate": 8.154479618570046e-06,
"step_time_sec": 106.02
},
{
"step": 1130,
"epoch": 0.6110998445210573,
"wallclock": "2026-06-24T11:01:07.592225",
"loss": 0.0632,
"grad_norm": 0.36586880683898926,
"learning_rate": 8.137461277928039e-06,
"step_time_sec": 104.99
},
{
"step": 1135,
"epoch": 0.6138038261339823,
"wallclock": "2026-06-24T11:02:52.501190",
"loss": 0.0638,
"grad_norm": 0.6821796298027039,
"learning_rate": 8.120382770795354e-06,
"step_time_sec": 104.91
},
{
"step": 1140,
"epoch": 0.6165078077469073,
"wallclock": "2026-06-24T11:04:37.321268",
"loss": 0.0672,
"grad_norm": 0.7406355142593384,
"learning_rate": 8.103244424683232e-06,
"step_time_sec": 104.82
},
{
"step": 1145,
"epoch": 0.6192117893598323,
"wallclock": "2026-06-24T11:06:23.997767",
"loss": 0.0551,
"grad_norm": 0.6757558584213257,
"learning_rate": 8.086046568250438e-06,
"step_time_sec": 106.68
},
{
"step": 1150,
"epoch": 0.6219157709727574,
"wallclock": "2026-06-24T11:08:10.298796",
"loss": 0.0585,
"grad_norm": 0.6179367899894714,
"learning_rate": 8.06878953129695e-06,
"step_time_sec": 106.3,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1155,
"epoch": 0.6246197525856824,
"wallclock": "2026-06-24T11:09:56.115384",
"loss": 0.0607,
"grad_norm": 0.9675397872924805,
"learning_rate": 8.051473644757644e-06,
"step_time_sec": 105.82
},
{
"step": 1160,
"epoch": 0.6273237341986074,
"wallclock": "2026-06-24T11:11:41.181597",
"loss": 0.0779,
"grad_norm": 0.6827834248542786,
"learning_rate": 8.034099240695942e-06,
"step_time_sec": 105.07
},
{
"step": 1165,
"epoch": 0.6300277158115325,
"wallclock": "2026-06-24T11:13:27.479419",
"loss": 0.0539,
"grad_norm": 0.43536603450775146,
"learning_rate": 8.016666652297443e-06,
"step_time_sec": 106.3
},
{
"step": 1170,
"epoch": 0.6327316974244576,
"wallclock": "2026-06-24T11:15:14.490172",
"loss": 0.0508,
"grad_norm": 0.5545168519020081,
"learning_rate": 7.999176213863536e-06,
"step_time_sec": 107.01
},
{
"step": 1175,
"epoch": 0.6354356790373825,
"wallclock": "2026-06-24T11:17:00.009949",
"loss": 0.0414,
"grad_norm": 0.43939608335494995,
"learning_rate": 7.981628260804992e-06,
"step_time_sec": 105.52
},
{
"step": 1180,
"epoch": 0.6381396606503076,
"wallclock": "2026-06-24T11:18:49.614174",
"loss": 0.0585,
"grad_norm": 0.7514466047286987,
"learning_rate": 7.964023129635528e-06,
"step_time_sec": 109.6
},
{
"step": 1185,
"epoch": 0.6408436422632326,
"wallclock": "2026-06-24T11:20:35.886257",
"loss": 0.0742,
"grad_norm": 1.2430953979492188,
"learning_rate": 7.946361157965354e-06,
"step_time_sec": 106.27
},
{
"step": 1190,
"epoch": 0.6435476238761576,
"wallclock": "2026-06-24T11:22:21.402749",
"loss": 0.0556,
"grad_norm": 0.6524196863174438,
"learning_rate": 7.928642684494696e-06,
"step_time_sec": 105.52
},
{
"step": 1195,
"epoch": 0.6462516054890827,
"wallclock": "2026-06-24T11:24:07.831963",
"loss": 0.066,
"grad_norm": 0.7481945157051086,
"learning_rate": 7.910868049007312e-06,
"step_time_sec": 106.43
},
{
"step": 1200,
"epoch": 0.6489555871020077,
"wallclock": "2026-06-24T11:25:53.724397",
"loss": 0.0712,
"grad_norm": 0.9145833849906921,
"learning_rate": 7.893037592363959e-06,
"step_time_sec": 105.89,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1200,
"epoch": 0.6489555871020077,
"wallclock": "2026-06-24T11:27:34.112458",
"eval_loss": 0.0755784884095192,
"eval_runtime": 100.3831,
"eval_samples_per_second": 4.981,
"eval_steps_per_second": 1.245,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1205,
"epoch": 0.6516595687149327,
"wallclock": "2026-06-24T11:30:41.010276",
"loss": 0.0515,
"grad_norm": 0.7790878415107727,
"learning_rate": 7.875151656495874e-06,
"step_time_sec": 287.29
},
{
"step": 1210,
"epoch": 0.6543635503278578,
"wallclock": "2026-06-24T11:32:27.481096",
"loss": 0.0566,
"grad_norm": 0.6310415267944336,
"learning_rate": 7.8572105843982e-06,
"step_time_sec": 106.47
},
{
"step": 1215,
"epoch": 0.6570675319407828,
"wallclock": "2026-06-24T11:34:13.893600",
"loss": 0.0451,
"grad_norm": 0.5569303631782532,
"learning_rate": 7.839214720123427e-06,
"step_time_sec": 106.41
},
{
"step": 1220,
"epoch": 0.6597715135537079,
"wallclock": "2026-06-24T11:35:58.515537",
"loss": 0.0688,
"grad_norm": 0.636441171169281,
"learning_rate": 7.821164408774772e-06,
"step_time_sec": 104.62
},
{
"step": 1225,
"epoch": 0.6624754951666328,
"wallclock": "2026-06-24T11:37:42.397948",
"loss": 0.0697,
"grad_norm": 0.7517639398574829,
"learning_rate": 7.803059996499584e-06,
"step_time_sec": 103.88
},
{
"step": 1230,
"epoch": 0.6651794767795579,
"wallclock": "2026-06-24T11:39:28.808346",
"loss": 0.0575,
"grad_norm": 0.5596706867218018,
"learning_rate": 7.78490183048269e-06,
"step_time_sec": 106.41
},
{
"step": 1235,
"epoch": 0.667883458392483,
"wallclock": "2026-06-24T11:41:14.200078",
"loss": 0.0586,
"grad_norm": 0.645969033241272,
"learning_rate": 7.76669025893974e-06,
"step_time_sec": 105.39
},
{
"step": 1240,
"epoch": 0.6705874400054079,
"wallclock": "2026-06-24T11:43:00.685648",
"loss": 0.0658,
"grad_norm": 0.7119715213775635,
"learning_rate": 7.748425631110536e-06,
"step_time_sec": 106.49
},
{
"step": 1245,
"epoch": 0.673291421618333,
"wallclock": "2026-06-24T11:44:47.803121",
"loss": 0.07,
"grad_norm": 1.2201249599456787,
"learning_rate": 7.730108297252328e-06,
"step_time_sec": 107.12
},
{
"step": 1250,
"epoch": 0.6759954032312581,
"wallclock": "2026-06-24T11:46:32.404383",
"loss": 0.0473,
"grad_norm": 0.7548292875289917,
"learning_rate": 7.7117386086331e-06,
"step_time_sec": 104.6,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1255,
"epoch": 0.678699384844183,
"wallclock": "2026-06-24T11:48:18.393499",
"loss": 0.0855,
"grad_norm": 1.1629971265792847,
"learning_rate": 7.693316917524832e-06,
"step_time_sec": 105.99
},
{
"step": 1260,
"epoch": 0.6814033664571081,
"wallclock": "2026-06-24T11:50:04.895492",
"loss": 0.0511,
"grad_norm": 0.798232913017273,
"learning_rate": 7.674843577196747e-06,
"step_time_sec": 106.5
},
{
"step": 1265,
"epoch": 0.6841073480700331,
"wallclock": "2026-06-24T11:51:50.622300",
"loss": 0.055,
"grad_norm": 0.5960519909858704,
"learning_rate": 7.656318941908534e-06,
"step_time_sec": 105.73
},
{
"step": 1270,
"epoch": 0.6868113296829581,
"wallclock": "2026-06-24T11:53:37.318998",
"loss": 0.069,
"grad_norm": 0.8142486810684204,
"learning_rate": 7.637743366903559e-06,
"step_time_sec": 106.7
},
{
"step": 1275,
"epoch": 0.6895153112958832,
"wallclock": "2026-06-24T11:55:24.097132",
"loss": 0.0486,
"grad_norm": 0.6205362677574158,
"learning_rate": 7.61911720840204e-06,
"step_time_sec": 106.78
},
{
"step": 1280,
"epoch": 0.6922192929088082,
"wallclock": "2026-06-24T11:57:09.718166",
"loss": 0.0653,
"grad_norm": 1.1235874891281128,
"learning_rate": 7.60044082359424e-06,
"step_time_sec": 105.62
},
{
"step": 1285,
"epoch": 0.6949232745217333,
"wallclock": "2026-06-24T11:58:56.721018",
"loss": 0.078,
"grad_norm": 0.8355940580368042,
"learning_rate": 7.581714570633586e-06,
"step_time_sec": 107.0
},
{
"step": 1290,
"epoch": 0.6976272561346583,
"wallclock": "2026-06-24T12:00:43.212505",
"loss": 0.0592,
"grad_norm": 2.8660950660705566,
"learning_rate": 7.562938808629829e-06,
"step_time_sec": 106.49
},
{
"step": 1295,
"epoch": 0.7003312377475833,
"wallclock": "2026-06-24T12:02:28.492093",
"loss": 0.058,
"grad_norm": 0.8744626045227051,
"learning_rate": 7.54411389764214e-06,
"step_time_sec": 105.28
},
{
"step": 1300,
"epoch": 0.7030352193605084,
"wallclock": "2026-06-24T12:04:14.101813",
"loss": 0.0608,
"grad_norm": 0.6016539931297302,
"learning_rate": 7.52524019867221e-06,
"step_time_sec": 105.61,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1300,
"epoch": 0.7030352193605084,
"wallclock": "2026-06-24T12:05:54.677349",
"eval_loss": 0.07137385755777359,
"eval_runtime": 100.5705,
"eval_samples_per_second": 4.972,
"eval_steps_per_second": 1.243,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1305,
"epoch": 0.7057392009734333,
"wallclock": "2026-06-24T12:08:54.483469",
"loss": 0.0709,
"grad_norm": 0.6734678149223328,
"learning_rate": 7.506318073657331e-06,
"step_time_sec": 280.38
},
{
"step": 1310,
"epoch": 0.7084431825863584,
"wallclock": "2026-06-24T12:10:38.696936",
"loss": 0.0623,
"grad_norm": 0.7641857266426086,
"learning_rate": 7.4873478854634476e-06,
"step_time_sec": 104.21
},
{
"step": 1315,
"epoch": 0.7111471641992835,
"wallclock": "2026-06-24T12:12:32.495225",
"loss": 0.0587,
"grad_norm": 0.8493006229400635,
"learning_rate": 7.4683299978782076e-06,
"step_time_sec": 113.8
},
{
"step": 1320,
"epoch": 0.7138511458122084,
"wallclock": "2026-06-24T12:14:17.906835",
"loss": 0.0474,
"grad_norm": 0.4841386377811432,
"learning_rate": 7.449264775603979e-06,
"step_time_sec": 105.41
},
{
"step": 1325,
"epoch": 0.7165551274251335,
"wallclock": "2026-06-24T12:16:05.319711",
"loss": 0.0603,
"grad_norm": 0.872616171836853,
"learning_rate": 7.430152584250856e-06,
"step_time_sec": 107.41
},
{
"step": 1330,
"epoch": 0.7192591090380586,
"wallclock": "2026-06-24T12:17:51.521993",
"loss": 0.0525,
"grad_norm": 0.7304244041442871,
"learning_rate": 7.410993790329652e-06,
"step_time_sec": 106.2
},
{
"step": 1335,
"epoch": 0.7219630906509835,
"wallclock": "2026-06-24T12:19:38.816730",
"loss": 0.0441,
"grad_norm": 0.5004603266716003,
"learning_rate": 7.3917887612448665e-06,
"step_time_sec": 107.29
},
{
"step": 1340,
"epoch": 0.7246670722639086,
"wallclock": "2026-06-24T12:21:24.716857",
"loss": 0.0669,
"grad_norm": 0.6454601287841797,
"learning_rate": 7.372537865287648e-06,
"step_time_sec": 105.9
},
{
"step": 1345,
"epoch": 0.7273710538768337,
"wallclock": "2026-06-24T12:23:10.411567",
"loss": 0.0422,
"grad_norm": 0.9636154174804688,
"learning_rate": 7.353241471628716e-06,
"step_time_sec": 105.69
},
{
"step": 1350,
"epoch": 0.7300750354897587,
"wallclock": "2026-06-24T12:24:56.017994",
"loss": 0.0456,
"grad_norm": 0.6495915651321411,
"learning_rate": 7.3338999503112975e-06,
"step_time_sec": 105.61,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1355,
"epoch": 0.7327790171026837,
"wallclock": "2026-06-24T12:26:41.498254",
"loss": 0.0547,
"grad_norm": 0.8502314686775208,
"learning_rate": 7.314513672244021e-06,
"step_time_sec": 105.48
},
{
"step": 1360,
"epoch": 0.7354829987156087,
"wallclock": "2026-06-24T12:28:28.488384",
"loss": 0.0607,
"grad_norm": 0.5915205478668213,
"learning_rate": 7.295083009193808e-06,
"step_time_sec": 106.99
},
{
"step": 1365,
"epoch": 0.7381869803285338,
"wallclock": "2026-06-24T12:30:17.161947",
"loss": 0.0654,
"grad_norm": 0.7883327603340149,
"learning_rate": 7.275608333778742e-06,
"step_time_sec": 108.67
},
{
"step": 1370,
"epoch": 0.7408909619414588,
"wallclock": "2026-06-24T12:32:05.107393",
"loss": 0.0552,
"grad_norm": 0.7381963133811951,
"learning_rate": 7.256090019460922e-06,
"step_time_sec": 107.95
},
{
"step": 1375,
"epoch": 0.7435949435543838,
"wallclock": "2026-06-24T12:33:52.735316",
"loss": 0.0649,
"grad_norm": 0.8336455821990967,
"learning_rate": 7.236528440539303e-06,
"step_time_sec": 107.63
},
{
"step": 1380,
"epoch": 0.7462989251673089,
"wallclock": "2026-06-24T12:35:40.395535",
"loss": 0.0393,
"grad_norm": 0.5212644338607788,
"learning_rate": 7.2169239721425154e-06,
"step_time_sec": 107.66
},
{
"step": 1385,
"epoch": 0.7490029067802338,
"wallclock": "2026-06-24T12:37:27.680116",
"loss": 0.0633,
"grad_norm": 0.8368508815765381,
"learning_rate": 7.197276990221677e-06,
"step_time_sec": 107.28
},
{
"step": 1390,
"epoch": 0.7517068883931589,
"wallclock": "2026-06-24T12:39:14.615045",
"loss": 0.0497,
"grad_norm": 0.7919797897338867,
"learning_rate": 7.177587871543172e-06,
"step_time_sec": 106.93
},
{
"step": 1395,
"epoch": 0.754410870006084,
"wallclock": "2026-06-24T12:41:03.202011",
"loss": 0.0564,
"grad_norm": 0.8120989799499512,
"learning_rate": 7.157856993681442e-06,
"step_time_sec": 108.59
},
{
"step": 1400,
"epoch": 0.757114851619009,
"wallclock": "2026-06-24T12:42:50.343742",
"loss": 0.0647,
"grad_norm": 0.6419529318809509,
"learning_rate": 7.138084735011727e-06,
"step_time_sec": 107.14,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1400,
"epoch": 0.757114851619009,
"wallclock": "2026-06-24T12:44:31.432549",
"eval_loss": 0.07097452133893967,
"eval_runtime": 101.0814,
"eval_samples_per_second": 4.947,
"eval_steps_per_second": 1.237,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1405,
"epoch": 0.759818833231934,
"wallclock": "2026-06-24T12:47:46.178602",
"loss": 0.0585,
"grad_norm": 0.5049331188201904,
"learning_rate": 7.118271474702828e-06,
"step_time_sec": 295.83
},
{
"step": 1410,
"epoch": 0.7625228148448591,
"wallclock": "2026-06-24T12:49:34.393720",
"loss": 0.0597,
"grad_norm": 0.8283151984214783,
"learning_rate": 7.098417592709819e-06,
"step_time_sec": 108.22
},
{
"step": 1415,
"epoch": 0.7652267964577841,
"wallclock": "2026-06-24T12:51:22.319403",
"loss": 0.052,
"grad_norm": 0.6273178458213806,
"learning_rate": 7.078523469766772e-06,
"step_time_sec": 107.93
},
{
"step": 1420,
"epoch": 0.7679307780707091,
"wallclock": "2026-06-24T12:53:09.791827",
"loss": 0.0504,
"grad_norm": 0.6756861805915833,
"learning_rate": 7.0585894873794514e-06,
"step_time_sec": 107.47
},
{
"step": 1425,
"epoch": 0.7706347596836342,
"wallclock": "2026-06-24T12:54:57.620245",
"loss": 0.0341,
"grad_norm": 0.5247818231582642,
"learning_rate": 7.038616027817998e-06,
"step_time_sec": 107.83
},
{
"step": 1430,
"epoch": 0.7733387412965592,
"wallclock": "2026-06-24T12:56:43.618402",
"loss": 0.0617,
"grad_norm": 0.5578892230987549,
"learning_rate": 7.018603474109601e-06,
"step_time_sec": 106.0
},
{
"step": 1435,
"epoch": 0.7760427229094842,
"wallclock": "2026-06-24T12:58:29.201049",
"loss": 0.0443,
"grad_norm": 0.8692203760147095,
"learning_rate": 6.9985522100311465e-06,
"step_time_sec": 105.58
},
{
"step": 1440,
"epoch": 0.7787467045224092,
"wallclock": "2026-06-24T13:00:16.509392",
"loss": 0.0692,
"grad_norm": 0.6393124461174011,
"learning_rate": 6.978462620101865e-06,
"step_time_sec": 107.31
},
{
"step": 1445,
"epoch": 0.7814506861353343,
"wallclock": "2026-06-24T13:02:04.488002",
"loss": 0.0524,
"grad_norm": 0.66062992811203,
"learning_rate": 6.958335089575952e-06,
"step_time_sec": 107.98
},
{
"step": 1450,
"epoch": 0.7841546677482593,
"wallclock": "2026-06-24T13:03:51.464399",
"loss": 0.0606,
"grad_norm": 0.3925676643848419,
"learning_rate": 6.938170004435186e-06,
"step_time_sec": 106.98,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1455,
"epoch": 0.7868586493611843,
"wallclock": "2026-06-24T13:05:39.613199",
"loss": 0.0539,
"grad_norm": 0.7175688147544861,
"learning_rate": 6.91796775138152e-06,
"step_time_sec": 108.15
},
{
"step": 1460,
"epoch": 0.7895626309741094,
"wallclock": "2026-06-24T13:07:26.686179",
"loss": 0.0571,
"grad_norm": 0.6691136360168457,
"learning_rate": 6.89772871782967e-06,
"step_time_sec": 107.07
},
{
"step": 1465,
"epoch": 0.7922666125870345,
"wallclock": "2026-06-24T13:09:14.110537",
"loss": 0.0749,
"grad_norm": 0.963224470615387,
"learning_rate": 6.877453291899685e-06,
"step_time_sec": 107.42
},
{
"step": 1470,
"epoch": 0.7949705941999594,
"wallclock": "2026-06-24T13:11:01.097029",
"loss": 0.0665,
"grad_norm": 0.702336311340332,
"learning_rate": 6.857141862409504e-06,
"step_time_sec": 106.99
},
{
"step": 1475,
"epoch": 0.7976745758128845,
"wallclock": "2026-06-24T13:12:48.608856",
"loss": 0.0502,
"grad_norm": 0.5416118502616882,
"learning_rate": 6.836794818867496e-06,
"step_time_sec": 107.51
},
{
"step": 1480,
"epoch": 0.8003785574258095,
"wallclock": "2026-06-24T13:14:36.033474",
"loss": 0.0441,
"grad_norm": 0.5691907405853271,
"learning_rate": 6.816412551464999e-06,
"step_time_sec": 107.42
},
{
"step": 1485,
"epoch": 0.8030825390387345,
"wallclock": "2026-06-24T13:16:24.108145",
"loss": 0.0626,
"grad_norm": 0.6911583542823792,
"learning_rate": 6.795995451068828e-06,
"step_time_sec": 108.07
},
{
"step": 1490,
"epoch": 0.8057865206516596,
"wallclock": "2026-06-24T13:18:10.811010",
"loss": 0.0563,
"grad_norm": 1.3713301420211792,
"learning_rate": 6.775543909213786e-06,
"step_time_sec": 106.7
},
{
"step": 1495,
"epoch": 0.8084905022645846,
"wallclock": "2026-06-24T13:19:58.721530",
"loss": 0.0483,
"grad_norm": 0.7632337212562561,
"learning_rate": 6.755058318095151e-06,
"step_time_sec": 107.91
},
{
"step": 1500,
"epoch": 0.8111944838775096,
"wallclock": "2026-06-24T13:21:45.388432",
"loss": 0.0697,
"grad_norm": 1.1038848161697388,
"learning_rate": 6.73453907056116e-06,
"step_time_sec": 106.67,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1500,
"epoch": 0.8111944838775096,
"wallclock": "2026-06-24T13:23:26.179575",
"eval_loss": 0.0741763636469841,
"eval_runtime": 100.7848,
"eval_samples_per_second": 4.961,
"eval_steps_per_second": 1.24,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1505,
"epoch": 0.8138984654904347,
"wallclock": "2026-06-24T13:26:37.503409",
"loss": 0.0618,
"grad_norm": 0.8092767000198364,
"learning_rate": 6.71398656010547e-06,
"step_time_sec": 292.11
},
{
"step": 1510,
"epoch": 0.8166024471033597,
"wallclock": "2026-06-24T13:28:25.707697",
"loss": 0.0432,
"grad_norm": 0.6367549300193787,
"learning_rate": 6.693401180859618e-06,
"step_time_sec": 108.2
},
{
"step": 1515,
"epoch": 0.8193064287162848,
"wallclock": "2026-06-24T13:30:13.512145",
"loss": 0.0434,
"grad_norm": 0.7922583222389221,
"learning_rate": 6.672783327585454e-06,
"step_time_sec": 107.8
},
{
"step": 1520,
"epoch": 0.8220104103292097,
"wallclock": "2026-06-24T13:32:01.383536",
"loss": 0.061,
"grad_norm": 0.7766749858856201,
"learning_rate": 6.65213339566758e-06,
"step_time_sec": 107.87
},
{
"step": 1525,
"epoch": 0.8247143919421348,
"wallclock": "2026-06-24T13:33:48.783229",
"loss": 0.0369,
"grad_norm": 0.5121834874153137,
"learning_rate": 6.631451781105767e-06,
"step_time_sec": 107.4
},
{
"step": 1530,
"epoch": 0.8274183735550599,
"wallclock": "2026-06-24T13:35:36.299372",
"loss": 0.0582,
"grad_norm": 0.726270318031311,
"learning_rate": 6.6107388805073495e-06,
"step_time_sec": 107.52
},
{
"step": 1535,
"epoch": 0.8301223551679848,
"wallclock": "2026-06-24T13:37:23.706841",
"loss": 0.0468,
"grad_norm": 0.6746184825897217,
"learning_rate": 6.589995091079636e-06,
"step_time_sec": 107.41
},
{
"step": 1540,
"epoch": 0.8328263367809099,
"wallclock": "2026-06-24T13:39:10.512639",
"loss": 0.0595,
"grad_norm": 0.8106797337532043,
"learning_rate": 6.569220810622281e-06,
"step_time_sec": 106.81
},
{
"step": 1545,
"epoch": 0.835530318393835,
"wallclock": "2026-06-24T13:40:56.792388",
"loss": 0.0539,
"grad_norm": 0.7323052287101746,
"learning_rate": 6.548416437519658e-06,
"step_time_sec": 106.28
},
{
"step": 1550,
"epoch": 0.8382343000067599,
"wallclock": "2026-06-24T13:42:44.227103",
"loss": 0.0491,
"grad_norm": 0.6671241521835327,
"learning_rate": 6.5275823707332275e-06,
"step_time_sec": 107.43,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1555,
"epoch": 0.840938281619685,
"wallclock": "2026-06-24T13:44:32.006313",
"loss": 0.0463,
"grad_norm": 1.1120103597640991,
"learning_rate": 6.50671900979387e-06,
"step_time_sec": 107.78
},
{
"step": 1560,
"epoch": 0.84364226323261,
"wallclock": "2026-06-24T13:46:19.183463",
"loss": 0.0542,
"grad_norm": 0.3518182039260864,
"learning_rate": 6.485826754794245e-06,
"step_time_sec": 107.18
},
{
"step": 1565,
"epoch": 0.846346244845535,
"wallclock": "2026-06-24T13:48:06.385481",
"loss": 0.0379,
"grad_norm": 0.7423526644706726,
"learning_rate": 6.464906006381101e-06,
"step_time_sec": 107.2
},
{
"step": 1570,
"epoch": 0.8490502264584601,
"wallclock": "2026-06-24T13:49:54.323330",
"loss": 0.0477,
"grad_norm": 0.7195249795913696,
"learning_rate": 6.443957165747601e-06,
"step_time_sec": 107.94
},
{
"step": 1575,
"epoch": 0.8517542080713851,
"wallclock": "2026-06-24T13:51:40.924711",
"loss": 0.058,
"grad_norm": 1.0430902242660522,
"learning_rate": 6.422980634625627e-06,
"step_time_sec": 106.6
},
{
"step": 1580,
"epoch": 0.8544581896843102,
"wallclock": "2026-06-24T13:53:26.389981",
"loss": 0.054,
"grad_norm": 0.8965272903442383,
"learning_rate": 6.4019768152780785e-06,
"step_time_sec": 105.47
},
{
"step": 1585,
"epoch": 0.8571621712972352,
"wallclock": "2026-06-24T13:55:13.357681",
"loss": 0.0538,
"grad_norm": 0.9105026125907898,
"learning_rate": 6.380946110491151e-06,
"step_time_sec": 106.97
},
{
"step": 1590,
"epoch": 0.8598661529101602,
"wallclock": "2026-06-24T13:57:00.092764",
"loss": 0.0405,
"grad_norm": 0.7773502469062805,
"learning_rate": 6.359888923566621e-06,
"step_time_sec": 106.74
},
{
"step": 1595,
"epoch": 0.8625701345230853,
"wallclock": "2026-06-24T13:58:47.207895",
"loss": 0.0522,
"grad_norm": 1.0928678512573242,
"learning_rate": 6.338805658314106e-06,
"step_time_sec": 107.12
},
{
"step": 1600,
"epoch": 0.8652741161360102,
"wallclock": "2026-06-24T14:00:34.321798",
"loss": 0.0346,
"grad_norm": 0.37700727581977844,
"learning_rate": 6.317696719043327e-06,
"step_time_sec": 107.11,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1600,
"epoch": 0.8652741161360102,
"wallclock": "2026-06-24T14:02:14.979680",
"eval_loss": 0.07683192193508148,
"eval_runtime": 100.6515,
"eval_samples_per_second": 4.968,
"eval_steps_per_second": 1.242,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1605,
"epoch": 0.8679780977489353,
"wallclock": "2026-06-24T14:05:26.609530",
"loss": 0.0504,
"grad_norm": 0.8107773065567017,
"learning_rate": 6.2965625105563445e-06,
"step_time_sec": 292.29
},
{
"step": 1610,
"epoch": 0.8706820793618604,
"wallclock": "2026-06-24T14:07:13.558796",
"loss": 0.0366,
"grad_norm": 1.109079122543335,
"learning_rate": 6.275403438139801e-06,
"step_time_sec": 106.95
},
{
"step": 1615,
"epoch": 0.8733860609747853,
"wallclock": "2026-06-24T14:09:00.902964",
"loss": 0.0516,
"grad_norm": 0.990442156791687,
"learning_rate": 6.254219907557159e-06,
"step_time_sec": 107.34
},
{
"step": 1620,
"epoch": 0.8760900425877104,
"wallclock": "2026-06-24T14:10:48.617807",
"loss": 0.0418,
"grad_norm": 0.7781974077224731,
"learning_rate": 6.2330123250409e-06,
"step_time_sec": 107.71
},
{
"step": 1625,
"epoch": 0.8787940242006355,
"wallclock": "2026-06-24T14:12:35.112434",
"loss": 0.0574,
"grad_norm": 1.2163763046264648,
"learning_rate": 6.211781097284754e-06,
"step_time_sec": 106.49
},
{
"step": 1630,
"epoch": 0.8814980058135604,
"wallclock": "2026-06-24T14:14:21.209253",
"loss": 0.0626,
"grad_norm": 0.9669123291969299,
"learning_rate": 6.190526631435882e-06,
"step_time_sec": 106.1
},
{
"step": 1635,
"epoch": 0.8842019874264855,
"wallclock": "2026-06-24T14:16:08.902128",
"loss": 0.04,
"grad_norm": 1.140141487121582,
"learning_rate": 6.169249335087085e-06,
"step_time_sec": 107.69
},
{
"step": 1640,
"epoch": 0.8869059690394105,
"wallclock": "2026-06-24T14:17:56.627117",
"loss": 0.0813,
"grad_norm": 1.00438392162323,
"learning_rate": 6.1479496162689775e-06,
"step_time_sec": 107.72
},
{
"step": 1645,
"epoch": 0.8896099506523356,
"wallclock": "2026-06-24T14:19:43.930288",
"loss": 0.051,
"grad_norm": 1.1830681562423706,
"learning_rate": 6.1266278834421634e-06,
"step_time_sec": 107.3
},
{
"step": 1650,
"epoch": 0.8923139322652606,
"wallclock": "2026-06-24T14:21:30.620817",
"loss": 0.048,
"grad_norm": 0.7539001107215881,
"learning_rate": 6.105284545489408e-06,
"step_time_sec": 106.69,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1655,
"epoch": 0.8950179138781856,
"wallclock": "2026-06-24T14:23:17.090603",
"loss": 0.044,
"grad_norm": 0.8819478750228882,
"learning_rate": 6.083920011707794e-06,
"step_time_sec": 106.47
},
{
"step": 1660,
"epoch": 0.8977218954911107,
"wallclock": "2026-06-24T14:25:03.986530",
"loss": 0.0369,
"grad_norm": 0.6605198383331299,
"learning_rate": 6.062534691800865e-06,
"step_time_sec": 106.9
},
{
"step": 1665,
"epoch": 0.9004258771040357,
"wallclock": "2026-06-24T14:26:50.818521",
"loss": 0.0434,
"grad_norm": 0.5586560368537903,
"learning_rate": 6.04112899587079e-06,
"step_time_sec": 106.83
},
{
"step": 1670,
"epoch": 0.9031298587169607,
"wallclock": "2026-06-24T14:28:38.292626",
"loss": 0.0537,
"grad_norm": 0.6612546443939209,
"learning_rate": 6.019703334410473e-06,
"step_time_sec": 107.47
},
{
"step": 1675,
"epoch": 0.9058338403298858,
"wallclock": "2026-06-24T14:30:25.839650",
"loss": 0.045,
"grad_norm": 0.8835639357566833,
"learning_rate": 5.998258118295699e-06,
"step_time_sec": 107.55
},
{
"step": 1680,
"epoch": 0.9085378219428107,
"wallclock": "2026-06-24T14:32:15.497667",
"loss": 0.0351,
"grad_norm": 0.7877563834190369,
"learning_rate": 5.9767937587772464e-06,
"step_time_sec": 109.66
},
{
"step": 1685,
"epoch": 0.9112418035557358,
"wallclock": "2026-06-24T14:34:02.801842",
"loss": 0.0423,
"grad_norm": 0.8421223759651184,
"learning_rate": 5.955310667473003e-06,
"step_time_sec": 107.3
},
{
"step": 1690,
"epoch": 0.9139457851686609,
"wallclock": "2026-06-24T14:35:49.804897",
"loss": 0.0549,
"grad_norm": 0.9553209543228149,
"learning_rate": 5.933809256360076e-06,
"step_time_sec": 107.0
},
{
"step": 1695,
"epoch": 0.9166497667815859,
"wallclock": "2026-06-24T14:37:39.298110",
"loss": 0.0365,
"grad_norm": 0.9886178374290466,
"learning_rate": 5.912289937766882e-06,
"step_time_sec": 109.49
},
{
"step": 1700,
"epoch": 0.9193537483945109,
"wallclock": "2026-06-24T14:39:25.987650",
"loss": 0.0488,
"grad_norm": 0.7625762820243835,
"learning_rate": 5.890753124365252e-06,
"step_time_sec": 106.69,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1700,
"epoch": 0.9193537483945109,
"wallclock": "2026-06-24T14:41:06.647821",
"eval_loss": 0.0766952782869339,
"eval_runtime": 100.6532,
"eval_samples_per_second": 4.968,
"eval_steps_per_second": 1.242,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1700,
"epoch": 0.9193537483945109,
"wallclock": "2026-06-24T14:42:31.431895",
"train_runtime": 39166.0066,
"train_samples_per_second": 3.021,
"train_steps_per_second": 0.094,
"total_flos": 5114610608766976.0,
"train_loss": 0.08617227443877389,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 33.45,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
},
{
"step": 1700,
"epoch": 0.9193537483945109,
"wallclock": "2026-06-24T14:44:29.019553",
"eval_loss": 0.07097452133893967,
"eval_runtime": 100.1105,
"eval_samples_per_second": 4.994,
"eval_steps_per_second": 1.249,
"gpu": [
{
"gpu": 0,
"mem_allocated_gb": 39.05,
"mem_reserved_gb": 80.99
},
{
"gpu": 1,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 2,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
},
{
"gpu": 3,
"mem_allocated_gb": 0.0,
"mem_reserved_gb": 0.0
}
]
}
]